#full_data <- readRDS('../data/full_data_20230930.rds')
hourly_full <- readRDS('../data/hourly_full_20230930.rds')
daily_full <- readRDS('../data/daily_full_20230930.rds')
daily_full <- daily_full %>%
filter(!is.na(H2S_daily_avg))
hourly_full <- hourly_full %>%
filter(!is.na(H2S_hourly_avg))
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4850693 259.1 8355088 446.3 NA 7392967 394.9
## Vcells 19281450 147.2 37136747 283.4 24576 31973264 244.0
# full_data <- full_data %>%
# filter(!is.na(H2S)) %>%
# select(-starts_with('daily_'), -starts_with('H2S_daily'),
# -all_of(c('Ammonia', 'Benzene', 'Black Carbon', 'DST', 'utm_x', 'utm_y',
# 'county')))
# gc()
monitor_names <- c("ElSegundo" = "El Segundo",
"StAnthony" = "St. Anthony",
"Manhattan" = "Manhattan",
"WestHS" = "West HS",
"ElmAve" = "Elm Ave",
"NorthHS" = "North HS",
"GuenserPark" = "Guenser Park",
"Chico" = "213th & Chico",
"Judson" = "Judson",
"HarborPark" = "Harbor Park",
"FirstMethodist" = "First Methodist",
"GStreet" = "G Street",
"StLuke" = "St. Luke",
"Hudson" = "Hudson",
"InnerPort" = "Inner Port")
base_monitor_stat <- daily_full %>%
group_by(Monitor) %>%
summarise('Start Date' = strftime(min(day), '%Y-%m-%d'),
'End Date' = strftime(max(day), '%Y-%m-%d'),
'Closest Refinery' = unique(closest_ref),
'Distance to Nearest Refinery (m)' = round(unique(dist_ref)),
'Angle to Refinery' = unique(angle_ref),
'Distance to Nearest WRP (m)' = round(unique(dist_wrp)),
'Capacity of Nearest WRP' = unique(closest_wrp_capacity),
'Angle to WRP' = round(unique(angle_wrp)),
'Distance to Dominguez Channel (m)' = round(unique(dist_dc)),
'Elevation' = unique(elevation),
'Enhanced Vegetation Index' = unique(EVI)) %>%
mutate(`Closest Refinery` = case_when(`Closest Refinery` == "Phillips 66 (Wilmington)" ~ "Phillips 66",
`Closest Refinery` == "Torrance Refinery" ~ "Torrance",
`Closest Refinery` == "Valero Refinery" ~ "Valero",
`Closest Refinery` == "Marathon (Carson)" ~ "Marathon Carson",
`Closest Refinery` == "Marathon (Wilmington)" ~ "Marathon Wilmington",
.default = `Closest Refinery`)) %>%
mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
arrange(factor(Monitor, levels = unname(monitor_names)))
knitr::kable(base_monitor_stat, digits = 2, format = 'html')
| Monitor | Start Date | End Date | Closest Refinery | Distance to Nearest Refinery (m) | Angle to Refinery | Distance to Nearest WRP (m) | Capacity of Nearest WRP | Angle to WRP | Distance to Dominguez Channel (m) | Elevation | Enhanced Vegetation Index |
|---|---|---|---|---|---|---|---|---|---|---|---|
| El Segundo | 2020-01-01 | 2020-08-13 | Chevron El Segundo | 1174 | 190.87 | 2810 | 850 | 294 | 6437 | 60 | 0.17 |
| St. Anthony | 2020-04-23 | 2023-09-30 | Chevron El Segundo | 970 | 188.57 | 2825 | 850 | 298 | 6543 | 44 | 0.17 |
| Manhattan | 2020-03-24 | 2023-09-30 | Chevron El Segundo | 2341 | 341.46 | 5462 | 850 | 325 | 6145 | 42 | 0.19 |
| West HS | 2020-01-01 | 2022-04-29 | Torrance | 3536 | 85.69 | 9333 | 400 | 123 | 1547 | 36 | 0.15 |
| Elm Ave | 2020-01-01 | 2023-09-30 | Torrance | 1362 | 5.34 | 5966 | 400 | 132 | 3955 | 32 | 0.07 |
| North HS | 2020-01-01 | 2022-04-29 | Torrance | 1779 | 160.72 | 8569 | 400 | 145 | 4858 | 24 | 0.15 |
| Guenser Park | 2020-04-23 | 2023-09-30 | Torrance | 2400 | 220.25 | 7702 | 400 | 159 | 375 | 16 | 0.14 |
| 213th & Chico | 2021-10-14 | 2022-01-28 | Marathon Carson | 2879 | 145.62 | 4297 | 400 | 213 | 50 | 7 | 0.12 |
| Judson | 2020-02-25 | 2023-09-30 | Marathon Carson | 2715 | 112.45 | 2692 | 400 | 213 | 1481 | 13 | 0.14 |
| Harbor Park | 2020-01-01 | 2023-09-30 | Phillips 66 | 1463 | 183.71 | 2012 | 400 | 6 | 4262 | 12 | 0.60 |
| First Methodist | 2020-03-04 | 2023-09-30 | Phillips 66 | 1124 | 205.55 | 2456 | 400 | 355 | 3792 | 14 | 0.21 |
| G Street | 2021-01-20 | 2023-09-30 | Phillips 66 | 717 | 222.51 | 2940 | 400 | 356 | 3748 | 8 | 0.09 |
| St. Luke | 2020-02-18 | 2023-09-30 | Marathon Carson | 2768 | 260.38 | 6910 | 400 | 256 | 1790 | 10 | 0.17 |
| Hudson | 2020-01-01 | 2023-09-30 | Marathon Wilmington | 1378 | 240.55 | 5920 | 400 | 272 | 705 | 8 | 0.14 |
| Inner Port | 2020-04-22 | 2023-09-30 | Valero | 2022 | 260.92 | 5970 | 15 | 228 | 1937 | 5 | 0.04 |
sincefeb2022_stat <- daily_full %>%
filter(day > '2022-01-31') %>%
group_by(Monitor) %>%
summarise('Daily observations' = n(),
'Max Daily Max' = max(H2S_daily_max, na.rm=T),
'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
'Average Wind Speed' = mean(ws_avg, na.rm=T),
'Average Wind Direction' = as.numeric(mean(circular(wd_avg,
units = 'degrees'),
na.rm=T)),
'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0,
`Average Wind Direction`+360,
`Average Wind Direction`)) %>%
mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
arrange(factor(Monitor, levels = unname(monitor_names)))
knitr::kable(sincefeb2022_stat, digits = 2, format = 'html')
| Monitor | Daily observations | Max Daily Max | Max Daily Average | Avg Daily Max | Avg Daily Average | Average Wind Speed | Average Wind Direction | Average daily odor complaints within zipcode | Active wells 2km |
|---|---|---|---|---|---|---|---|---|---|
| St. Anthony | 608 | 140.15 | 3.41 | 2.45 | 0.52 | 7.83 | 236.36 | 4.65 | 1.96 |
| Manhattan | 599 | 6.82 | 1.40 | 0.82 | 0.39 | 3.40 | 233.30 | 0.04 | 0.00 |
| West HS | 88 | 2.90 | 0.88 | 1.09 | 0.40 | 6.24 | 272.30 | 0.03 | 3.00 |
| Elm Ave | 608 | 5.10 | 3.10 | 1.60 | 0.89 | 4.53 | 243.43 | 0.04 | 2.00 |
| North HS | 89 | 4.00 | 1.61 | 1.37 | 0.51 | 5.36 | 240.86 | 0.00 | 0.00 |
| Guenser Park | 546 | 7.66 | 2.66 | 1.76 | 0.80 | 4.43 | 260.55 | 0.02 | 0.00 |
| Judson | 583 | 9.75 | 2.80 | 1.69 | 0.54 | 3.80 | 272.41 | 0.53 | 0.00 |
| Harbor Park | 414 | 9.65 | 2.48 | 1.85 | 0.50 | 2.89 | 296.16 | 0.07 | 44.30 |
| First Methodist | 606 | 14.72 | 2.34 | 2.33 | 0.71 | 3.27 | 269.13 | 0.11 | 29.22 |
| G Street | 607 | 39.18 | 2.72 | 3.82 | 0.83 | 4.96 | 277.89 | 0.11 | 19.36 |
| St. Luke | 602 | 11.62 | 3.43 | 2.25 | 0.74 | 3.66 | 291.26 | 0.23 | 5.00 |
| Hudson | 599 | 98.17 | 6.08 | 2.95 | 0.99 | 3.49 | 39.57 | 0.23 | 2.70 |
| Inner Port | 582 | 53.70 | 5.39 | 6.31 | 0.87 | 5.07 | 227.02 | 0.05 | 83.81 |
disaster_stat <- daily_full %>%
filter(year == '2021', month %in% c('10', '11', '12')) %>%
group_by(Monitor) %>%
summarise('Daily observations' = n(),
'Max Daily Max' = max(H2S_daily_max, na.rm=T),
'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
'Average Wind Speed' = mean(ws_avg, na.rm=T),
'Average Wind Direction' = as.numeric(mean(circular(wd_avg,
units = 'degrees'),
na.rm=T)),
'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0,
`Average Wind Direction`+360,
`Average Wind Direction`)) %>%
mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
arrange(factor(Monitor, levels = unname(monitor_names)))
knitr::kable(disaster_stat, digits = 2, format = 'html')
| Monitor | Daily observations | Max Daily Max | Max Daily Average | Avg Daily Max | Avg Daily Average | Average Wind Speed | Average Wind Direction | Average daily odor complaints within zipcode | Active wells 2km |
|---|---|---|---|---|---|---|---|---|---|
| St. Anthony | 92 | 20.21 | 2.41 | 3.01 | 0.69 | 7.07 | 343.87 | 2.96 | 3.00 |
| Manhattan | 92 | 25.45 | 4.06 | 3.48 | 1.51 | 2.74 | 212.03 | 0.11 | 0.00 |
| West HS | 92 | 41.50 | 4.87 | 3.50 | 0.74 | 3.95 | 239.76 | 0.53 | 3.00 |
| Elm Ave | 92 | 63.50 | 8.15 | 7.73 | 1.34 | 3.77 | 236.77 | 0.53 | 2.00 |
| North HS | 92 | 98.50 | 7.88 | 6.93 | 1.15 | 3.74 | 235.24 | 0.46 | 0.00 |
| Guenser Park | 92 | 211.67 | 14.48 | 14.64 | 1.63 | 3.51 | 276.58 | 0.46 | 0.00 |
| 213th & Chico | 79 | 13407.18 | 1639.53 | 1025.43 | 128.07 | 3.55 | 293.63 | 17.15 | 0.00 |
| Judson | 92 | 742.25 | 69.08 | 59.79 | 6.86 | 2.77 | 294.11 | 31.71 | 0.00 |
| Harbor Park | 92 | 75.93 | 9.46 | 8.51 | 1.20 | 2.30 | 306.88 | 0.49 | 42.99 |
| First Methodist | 92 | 149.47 | 14.11 | 10.92 | 1.74 | 2.62 | 296.60 | 1.64 | 30.33 |
| G Street | 92 | 48.67 | 6.61 | 9.15 | 1.75 | 3.58 | 334.25 | 1.64 | 18.33 |
| St. Luke | 92 | 119.72 | 12.26 | 10.91 | 2.43 | 3.07 | 351.66 | 0.91 | 3.00 |
| Hudson | 92 | 192.64 | 19.45 | 16.72 | 3.36 | 2.53 | 13.51 | 0.91 | 2.34 |
| Inner Port | 90 | 136.50 | 12.21 | 15.91 | 3.18 | 3.47 | 9.29 | 0.09 | 86.34 |
# Try only the october for the prediction map
disaster_oct_stat <- daily_full %>%
filter(year == '2021', month == '10') %>%
group_by(Monitor) %>%
summarise('Daily observations' = n(),
'Max Daily Max' = max(H2S_daily_max, na.rm=T),
'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
'Average Wind Speed' = mean(ws_avg, na.rm=T),
'Average Wind Direction' = as.numeric(mean(circular(wd_avg,
units = 'degrees'),
na.rm=T)),
'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0,
`Average Wind Direction`+360,
`Average Wind Direction`)) %>%
mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
arrange(factor(Monitor, levels = unname(monitor_names)))
knitr::kable(disaster_oct_stat, digits = 2, format = 'html')
| Monitor | Daily observations | Max Daily Max | Max Daily Average | Avg Daily Max | Avg Daily Average | Average Wind Speed | Average Wind Direction | Average daily odor complaints within zipcode | Active wells 2km |
|---|---|---|---|---|---|---|---|---|---|
| St. Anthony | 31 | 20.21 | 2.41 | 3.89 | 0.73 | 7.49 | 304.48 | 6.39 | 3 |
| Manhattan | 31 | 25.45 | 4.06 | 5.47 | 1.77 | 2.91 | 225.92 | 0.29 | 0 |
| West HS | 31 | 41.50 | 4.87 | 7.56 | 1.06 | 4.35 | 231.91 | 1.42 | 3 |
| Elm Ave | 31 | 63.50 | 8.15 | 18.37 | 2.37 | 4.16 | 238.62 | 1.42 | 2 |
| North HS | 31 | 98.50 | 7.88 | 16.83 | 2.02 | 4.11 | 236.18 | 1.32 | 0 |
| Guenser Park | 31 | 211.67 | 14.48 | 37.77 | 3.12 | 3.88 | 263.18 | 1.32 | 0 |
| 213th & Chico | 18 | 13407.18 | 1639.53 | 4337.41 | 536.21 | 4.41 | 260.39 | 49.44 | 0 |
| Judson | 31 | 742.25 | 69.08 | 167.25 | 17.18 | 2.91 | 290.90 | 79.10 | 0 |
| Harbor Park | 31 | 75.93 | 9.46 | 20.66 | 2.63 | 2.29 | 301.72 | 1.23 | 40 |
| First Methodist | 31 | 149.47 | 14.11 | 25.88 | 3.25 | 2.67 | 288.59 | 4.65 | 29 |
| G Street | 31 | 47.27 | 6.61 | 14.01 | 2.41 | 3.65 | 323.75 | 4.65 | 17 |
| St. Luke | 31 | 119.72 | 12.26 | 22.38 | 4.26 | 3.23 | 330.94 | 1.77 | 3 |
| Hudson | 31 | 192.64 | 19.45 | 39.27 | 5.95 | 2.76 | 343.90 | 1.77 | 2 |
| Inner Port | 31 | 136.50 | 12.21 | 22.60 | 3.79 | 3.87 | 325.23 | 0.26 | 87 |
normal_stat <- daily_full %>%
filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
group_by(Monitor) %>%
summarise('Daily observations' = n(),
'Max Daily Max' = max(H2S_daily_max, na.rm=T),
'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
'Average Wind Speed' = mean(ws_avg, na.rm=T),
'Average Wind Direction' = as.numeric(mean(circular(wd_avg,
units = 'degrees'),
na.rm=T)),
'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0,
`Average Wind Direction`+360,
`Average Wind Direction`)) %>%
mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
arrange(factor(Monitor, levels = unname(monitor_names)))
knitr::kable(normal_stat, digits = 2, format = 'html')
| Monitor | Daily observations | Max Daily Max | Max Daily Average | Avg Daily Max | Avg Daily Average | Average Wind Speed | Average Wind Direction | Average daily odor complaints within zipcode | Active wells 2km |
|---|---|---|---|---|---|---|---|---|---|
| El Segundo | 223 | 9.24 | 3.67 | 1.70 | 0.57 | 7.92 | 255.36 | 0.25 | 2.19 |
| St. Anthony | 1160 | 140.15 | 3.43 | 2.64 | 0.57 | 7.76 | 243.55 | 4.63 | 1.98 |
| Manhattan | 1180 | 6.82 | 2.38 | 1.03 | 0.53 | 4.02 | 241.77 | 0.06 | 0.00 |
| West HS | 755 | 18.50 | 3.01 | 1.54 | 0.60 | 5.40 | 247.07 | 0.04 | 1.91 |
| Elm Ave | 1268 | 11.40 | 3.20 | 1.87 | 0.92 | 4.53 | 242.45 | 0.04 | 1.86 |
| North HS | 758 | 30.60 | 2.71 | 1.91 | 1.02 | 4.62 | 241.06 | 0.01 | 0.00 |
| Guenser Park | 1073 | 16.18 | 2.66 | 1.50 | 0.62 | 4.55 | 259.26 | 0.02 | 0.00 |
| 213th & Chico | 28 | 14.27 | 3.92 | 4.78 | 3.12 | 2.34 | 312.27 | 2.75 | 0.00 |
| Judson | 1186 | 15.78 | 3.63 | 1.99 | 0.72 | 4.41 | 266.20 | 0.43 | 0.00 |
| Harbor Park | 1018 | 17.42 | 2.67 | 1.76 | 0.47 | 3.37 | 283.50 | 0.07 | 46.82 |
| First Methodist | 1208 | 20.91 | 4.18 | 2.42 | 0.76 | 3.81 | 263.94 | 0.16 | 30.09 |
| G Street | 883 | 382.76 | 18.66 | 4.13 | 0.83 | 4.74 | 297.94 | 0.13 | 19.30 |
| St. Luke | 1177 | 11.62 | 3.43 | 2.09 | 0.69 | 3.94 | 270.65 | 0.16 | 3.86 |
| Hudson | 1266 | 98.17 | 6.08 | 2.87 | 0.98 | 3.73 | 254.80 | 0.15 | 3.15 |
| Inner Port | 1139 | 53.70 | 5.39 | 5.54 | 0.84 | 4.82 | 229.05 | 0.05 | 88.32 |
# Try only the Oct 2022
normal_oct_stat <- daily_full %>%
filter(year == '2022' & month == '10') %>%
group_by(Monitor) %>%
summarise('Daily observations' = n(),
'Max Daily Max' = max(H2S_daily_max, na.rm=T),
'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
'Average Wind Speed' = mean(ws_avg, na.rm=T),
'Average Wind Direction' = as.numeric(mean(circular(wd_avg,
units = 'degrees'),
na.rm=T)),
'Average daily odor complaints within zipcode' = mean(num_odor_complaints, na.rm=T),
'Active wells 2km' = mean(active_2km, na.rm=T)) %>%
mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0,
`Average Wind Direction`+360,
`Average Wind Direction`)) %>%
mutate(Monitor = str_replace_all(Monitor, monitor_names)) %>%
arrange(factor(Monitor, levels = unname(monitor_names)))
knitr::kable(normal_oct_stat, digits = 2, format = 'html')
| Monitor | Daily observations | Max Daily Max | Max Daily Average | Avg Daily Max | Avg Daily Average | Average Wind Speed | Average Wind Direction | Average daily odor complaints within zipcode | Active wells 2km |
|---|---|---|---|---|---|---|---|---|---|
| St. Anthony | 31 | 4.24 | 1.54 | 2.12 | 0.98 | 6.65 | 259.53 | 14.13 | 3 |
| Manhattan | 23 | 1.95 | 0.89 | 0.96 | 0.46 | 2.68 | 224.60 | 0.00 | 0 |
| Elm Ave | 31 | 3.48 | 1.59 | 1.70 | 1.01 | 3.45 | 239.50 | 0.19 | 2 |
| Guenser Park | 31 | 6.50 | 2.02 | 2.59 | 1.23 | 3.14 | 266.04 | 0.00 | 0 |
| Judson | 31 | 2.97 | 0.63 | 0.77 | 0.25 | 2.59 | 284.61 | 1.16 | 0 |
| Harbor Park | 28 | 7.04 | 0.90 | 2.25 | 0.42 | 2.24 | 293.73 | 0.29 | 45 |
| First Methodist | 31 | 8.42 | 2.09 | 2.60 | 0.73 | 2.66 | 275.98 | 0.26 | 30 |
| G Street | 31 | 11.77 | 1.73 | 4.07 | 1.00 | 3.50 | 331.32 | 0.26 | 20 |
| St. Luke | 31 | 6.40 | 1.64 | 2.01 | 0.51 | 2.78 | 8.03 | 0.65 | 5 |
| Hudson | 29 | 13.34 | 2.12 | 3.18 | 1.20 | 2.71 | 57.46 | 0.62 | 3 |
| Inner Port | 31 | 52.63 | 2.07 | 6.95 | 0.80 | 3.02 | 161.12 | 0.00 | 80 |
table1 <- base_monitor_stat %>%
select(-c(`Angle to Refinery`, `Angle to WRP`, `Capacity of Nearest WRP`)) %>%
left_join(disaster_stat %>%
select(Monitor, `Avg Daily Average`) %>%
rename(`Disaster Avg Daily Average` = `Avg Daily Average`),
join_by(Monitor)) %>%
left_join(normal_stat %>%
select(Monitor, `Avg Daily Average`, `Average daily odor complaints within zipcode`) %>%
rename(`Normal Avg Daily Average` = `Avg Daily Average`,
`Normal Avg Daily odor complaints` = `Average daily odor complaints within zipcode`),
join_by(Monitor)) %>%
mutate(`Closest Refinery` = paste0(`Closest Refinery`, ' (', round(`Distance to Nearest Refinery (m)`/1000, 1), ')'),
'#' = 1:n()) %>%
select(-`Distance to Nearest Refinery (m)`) %>%
relocate('#', Monitor, `Start Date`, `End Date`, `Closest Refinery`, `Normal Avg Daily Average`, `Disaster Avg Daily Average`, `Normal Avg Daily odor complaints`)
table1_kable <- knitr::kable(table1, format = 'latex', digits = 2)
writeLines(table1_kable, '../figures/table1.tex')
knitr::kable(table1, format = 'html', digits = 2)
| # | Monitor | Start Date | End Date | Closest Refinery | Normal Avg Daily Average | Disaster Avg Daily Average | Normal Avg Daily odor complaints | Distance to Nearest WRP (m) | Distance to Dominguez Channel (m) | Elevation | Enhanced Vegetation Index |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | El Segundo | 2020-01-01 | 2020-08-13 | Chevron El Segundo (1.2) | 0.57 | NA | 0.25 | 2810 | 6437 | 60 | 0.17 |
| 2 | St. Anthony | 2020-04-23 | 2023-09-30 | Chevron El Segundo (1) | 0.57 | 0.69 | 4.63 | 2825 | 6543 | 44 | 0.17 |
| 3 | Manhattan | 2020-03-24 | 2023-09-30 | Chevron El Segundo (2.3) | 0.53 | 1.51 | 0.06 | 5462 | 6145 | 42 | 0.19 |
| 4 | West HS | 2020-01-01 | 2022-04-29 | Torrance (3.5) | 0.60 | 0.74 | 0.04 | 9333 | 1547 | 36 | 0.15 |
| 5 | Elm Ave | 2020-01-01 | 2023-09-30 | Torrance (1.4) | 0.92 | 1.34 | 0.04 | 5966 | 3955 | 32 | 0.07 |
| 6 | North HS | 2020-01-01 | 2022-04-29 | Torrance (1.8) | 1.02 | 1.15 | 0.01 | 8569 | 4858 | 24 | 0.15 |
| 7 | Guenser Park | 2020-04-23 | 2023-09-30 | Torrance (2.4) | 0.62 | 1.63 | 0.02 | 7702 | 375 | 16 | 0.14 |
| 8 | 213th & Chico | 2021-10-14 | 2022-01-28 | Marathon Carson (2.9) | 3.12 | 128.07 | 2.75 | 4297 | 50 | 7 | 0.12 |
| 9 | Judson | 2020-02-25 | 2023-09-30 | Marathon Carson (2.7) | 0.72 | 6.86 | 0.43 | 2692 | 1481 | 13 | 0.14 |
| 10 | Harbor Park | 2020-01-01 | 2023-09-30 | Phillips 66 (1.5) | 0.47 | 1.20 | 0.07 | 2012 | 4262 | 12 | 0.60 |
| 11 | First Methodist | 2020-03-04 | 2023-09-30 | Phillips 66 (1.1) | 0.76 | 1.74 | 0.16 | 2456 | 3792 | 14 | 0.21 |
| 12 | G Street | 2021-01-20 | 2023-09-30 | Phillips 66 (0.7) | 0.83 | 1.75 | 0.13 | 2940 | 3748 | 8 | 0.09 |
| 13 | St. Luke | 2020-02-18 | 2023-09-30 | Marathon Carson (2.8) | 0.69 | 2.43 | 0.16 | 6910 | 1790 | 10 | 0.17 |
| 14 | Hudson | 2020-01-01 | 2023-09-30 | Marathon Wilmington (1.4) | 0.98 | 3.36 | 0.15 | 5920 | 705 | 8 | 0.14 |
| 15 | Inner Port | 2020-04-22 | 2023-09-30 | Valero (2) | 0.84 | 3.18 | 0.05 | 5970 | 1937 | 5 | 0.04 |
hourly_responses <- c('H2S_hourly_avg', 'H2S_hourly_max')
# since feb 2022
daily_data_sincefeb2022 <- daily_full %>% filter(day > '2022-01-31')
hourly_data_sincefeb2022 <- hourly_full %>% filter(day > '2022-01-31')
# Disaster
daily_data_dis <- daily_full %>% filter(year == '2021', month %in% c('10', '11', '12'))
hourly_data_dis <- hourly_full %>% filter(year == '2021', month %in% c('10', '11', '12'))
# Exclude disaster stepwise
daily_data_excl_dis <- daily_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12')))
hourly_data_excl_dis <- hourly_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12')))
# Everything w. disaster indicator
daily_data_dis_ind <- daily_full %>%
mutate(disaster =
if_else(year == '2021', month %in% c('10', '11', '12'), 1, 0))
hourly_data_dis_ind <- hourly_full %>%
mutate(disaster =
if_else(year == '2021', month %in% c('10', '11', '12'), 1, 0))
daily_responses <- c('H2S_daily_avg', 'log(H2S_daily_avg)',
'H2S_daily_max', 'log(H2S_daily_max)')
hourly_responses <- c('H2S_hourly_avg', 'log(H2S_hourly_avg)',
'H2S_hourly_max', 'log(H2S_hourly_max)')
dateranges <- c('sincefeb2022', 'dis', 'excl_dis', 'dis_ind', 'full')
smooth <- c("s(as.numeric(month),bs='cc')",
"s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10)",
"te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), k=c(10,10),d=c(2,1),bs=c('tp','cc'))")
smooth_tibble <- tibble(features = c(list(c(smooth[1])),
list(c(smooth[2])),
list(c(smooth[3])),
list(c(smooth[1:2])),
list(c(smooth[c(1, 3)])),
list(c(smooth[2:3])),
list(c(smooth[1:3]))),
disaster_applicable = c(0, 1, 1, 0, 0, 1, 0))
# smooth_compare <- crossing(response = c(daily_responses, hourly_responses),
# daterange = dateranges) %>%
# cross_join(smooth_tibble) %>%
# mutate(GCV = NA) %>%
# filter(!(daterange == 'dis' & disaster_applicable == 0)) %>%
# select(-disaster_applicable)
#
# for (i in 1:nrow(smooth_compare)) {
# features <- unname(unlist(smooth_compare[i,'features']))
# formula_feature_str <- paste(features, collapse = ' + ')
# formula_str <- paste(smooth_compare[[i, 'response']], formula_feature_str, sep = ' ~ ')
# formula <- as.formula(formula_str)
#
# if (smooth_compare[[i, 'response']] %in% hourly_responses &
# smooth_compare[[i, 'daterange']] == 'full') {
# data <- hourly_full
# } else if (smooth_compare[[i, 'response']] %in% hourly_responses){
# data <- get(paste0('hourly_data_', smooth_compare[[i, 'daterange']]))
# } else if (smooth_compare[[i, 'daterange']] == 'full') {
# data <- daily_full
# } else {
# data <- get(paste0('daily_data_', smooth_compare[[i, 'daterange']]))
# }
#
# summary <- summary(gam(formula, data = data, method = 'GCV.Cp', select = TRUE))
# GCV_new <- summary$sp.criterion[[1]]
# smooth_compare[i, 'GCV'] <- GCV_new
# print(str_glue('Completed {i} iterations'))
# gc()
# }
#
# smooth_compare <- smooth_compare %>%
# group_by(response, daterange) %>%
# mutate(best = if_else(GCV == min(GCV), 1, 0)) %>%
# mutate(rounded_GCV = round(GCV, 2)) %>%
# rowwise() %>%
# mutate(month_smooth = if_else(smooth[1] %in% unlist(features), 1, 0),
# coord_smooth = if_else(smooth[2] %in% unlist(features), 1, 0),
# coord_day_3D_smooth = if_else(smooth[3] %in% unlist(features), 1, 0)) %>%
# ungroup()
# saveRDS(smooth_compare, 'smooth_compare.rds')
smooth_compare <- readRDS('smooth_compare.rds')
# get best smooth models for diff response and daterange
best_smooth <- smooth_compare %>%
group_by(response, daterange) %>%
filter(GCV == min(GCV)) %>%
ungroup() %>%
select(response, daterange, GCV, features, month_smooth, coord_smooth, coord_day_3D_smooth)
best_smooth
response_names <- c('da', 'log_da', 'dm', 'log_dm', 'ha', 'log_ha', 'hm', 'log_hm')
# smooth_models <- tibble(name = response_names,
# response = c(daily_responses, hourly_responses)) %>%
# crossing(tibble(daterange = dateranges)) %>%
# mutate(name = paste(name, daterange, sep = '_'))
smooth_predictors <-
c("s(as.numeric(month),bs='cc')",
"s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10)",
"te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day), k=c(10,10),d=c(2,1),bs=c('tp','cc'))")
# smooth_models$residuals <- rep(list(c()), nrow(smooth_models))
#
# for (i in 1:nrow(smooth_models)) {
# # first, get residuals of smooth on response
# if (smooth_models$daterange[i] == 'dis') {
# formula_feature_str <- paste(smooth_predictors[c(2, 3)], collapse = ' + ')
# } else {
# formula_feature_str <- paste(smooth_predictors, collapse = ' + ')
# }
#
# if (smooth_models[[i, 'response']] %in% hourly_responses &
# smooth_models[[i, 'daterange']] == 'full') {
# data <- hourly_full
# } else if (smooth_models[[i, 'response']] %in% hourly_responses) {
# data <- get(paste0('hourly_data_', smooth_models[[i, 'daterange']]))
# } else if (smooth_models[[i, 'daterange']] == 'full') {
# data <- daily_full
# } else {
# data <- get(paste0('daily_data_', smooth_models[[i, 'daterange']]))
# }
#
# formula_str <- paste(smooth_models[[i, 'response']], formula_feature_str, sep = ' ~ ')
# formula <- as.formula(formula_str)
# residuals <- gam(formula, data = data, method = 'GCV.Cp')$residuals
# smooth_models$residuals[i] <- list(residuals)
# }
# saveRDS(smooth_models, 'smooth_models.rds')
smooth_models <- readRDS('smooth_models.rds')
disaster_predictors <- c('month', 'weekday', 'wd_avg', 'ws_avg',
'I(1/dist_wrp^2)', 'I(1/dist_ref^2)', 'I(1/dist_dc^2)',
'monthly_oil_2km', 'monthly_gas_2km', 'active_2km',
'inactive_2km', 'elevation', 'EVI',
'num_odor_complaints', 'closest_wrp_capacity')
everything_predictors <- c('year', 'weekday', 'wd_avg', 'ws_avg',
'I(1/dist_wrp^2)', 'I(1/dist_ref^2)', 'I(1/dist_dc^2)',
'monthly_oil_2km', 'monthly_gas_2km', 'active_2km',
'inactive_2km', 'elevation', 'EVI',
'num_odor_complaints', 'closest_wrp_capacity')
daily_predictors <- c('daily_downwind_ref', 'daily_downwind_wrp', 'daily_temp',
'daily_hum', 'daily_precip')
hourly_predictors <- c('hourly_downwind_ref', 'hourly_downwind_wrp', 'hourly_temp',
'hourly_hum', 'hourly_precip')
disaster_linear_pred_str <- paste(disaster_predictors, collapse = ' + ')
everything_linear_pred_str <- paste(everything_predictors, collapse = ' + ')
daily_linear_pred_str <- paste(daily_predictors, collapse = ' + ')
hourly_linear_pred_str <- paste(hourly_predictors, collapse = ' + ')
# for (i in 1:nrow(smooth_models)) {
# if (smooth_models[[i, 'daterange']] == 'dis_ind' &
# smooth_models[[i, 'response']] %in% hourly_responses) {
# formula_str <- paste(everything_linear_pred_str,
# hourly_linear_pred_str,
# 'disaster', sep = ' + ')
# } else if (smooth_models[[i, 'daterange']] == 'dis' &
# smooth_models[[i, 'response']] %in% hourly_responses) {
# formula_str <- paste(disaster_linear_pred_str,
# hourly_linear_pred_str, sep = ' + ')
# } else if (smooth_models[[i, 'response']] %in% hourly_responses) {
# formula_str <- paste(everything_linear_pred_str,
# hourly_linear_pred_str, sep = ' + ')
# } else if (smooth_models[[i, 'daterange']] == 'dis_ind') {
# formula_str <- paste(everything_linear_pred_str,
# daily_linear_pred_str,
# 'disaster', sep = ' + ')
# } else if (smooth_models[[i, 'daterange']] == 'dis') {
# formula_str <- paste(disaster_linear_pred_str,
# daily_linear_pred_str, sep = ' + ')
# } else {
# formula_str <- paste(everything_linear_pred_str,
# daily_linear_pred_str, sep = ' + ')
# }
# formula_str <- paste('residuals', formula_str, sep = ' ~ ')
# formula <- as.formula(formula_str)
#
# if (smooth_models[[i, 'response']] %in% hourly_responses &
# smooth_models[[i, 'daterange']] == 'full') {
# data <- hourly_full
# } else if (smooth_models[[i, 'response']] %in% hourly_responses){
# data <- get(paste0('hourly_data_', smooth_models[[i, 'daterange']]))
# } else if (smooth_models[[i, 'daterange']] == 'full') {
# data <- daily_full
# } else {
# data <- get(paste0('daily_data_', smooth_models[[i, 'daterange']]))
# }
#
# data$residuals <- unlist(smooth_models$residuals[i])
# regsubsets <- regsubsets(formula, data, nvmax = Inf)
# assign(paste0(smooth_models$name[i], '_regsubsets'), regsubsets)
# print(str_glue('Completed {i} rows'))
# }
#
# for (i in 1:nrow(smooth_models)) {
# saveRDS(get(paste0(smooth_models$name[i], '_regsubsets')),
# paste0('regsubsets/', smooth_models$name[i], '_regsubsets.rds'))
# }
# read regsubsets
for (i in 1:nrow(smooth_models)) {
assign(paste0(smooth_models$name[i], '_regsubsets'),
readRDS(paste0('regsubsets/', smooth_models$name[i], '_regsubsets.rds')))
}
# best_model_sizes <- smooth_models %>%
# group_by(name, response, daterange) %>%
# summarise(mean_smooth_res = mean(unlist(residuals)),
# var_smooth_res = round(var(unlist(residuals))), 4) %>%
# select(any_of(c('name', 'response', 'daterange',
# 'mean_smooth_res', 'var_smooth_res'))) %>%
# mutate(Adj.R2 = NA,
# best_R2 = NA,
# CP = NA,
# best_CP = NA,
# BIC = NA,
# best_BIC = NA,
# linear_features = NA)
# for (i in 1:nrow(smooth_models)) {
# regsubset <- summary(get(paste0(smooth_models$name[i], '_regsubsets')))
# best_sizes <- tibble(Adj.R2 = which.max(regsubset$adjr2),
# best_R2 = regsubset$adjr2[which.max(regsubset$adjr2)],
# CP = which.min(regsubset$cp),
# best_CP = regsubset$cp[which.max(regsubset$cp)],
# BIC = which.min(regsubset$bic),
# best_BIC = regsubset$bic[which.max(regsubset$bic)],)
# best_features <- tibble(linear_features = list(setdiff(names(regsubset$which[best_sizes$CP, ]
# [unlist(regsubset$which[best_sizes$CP, ])]), '(Intercept)')))
# best_model_sizes[i,] <- bind_cols(best_model_sizes[i, 1:5], best_sizes, best_features)
# }
#
# best_models <- best_model_sizes %>%
# rowwise() %>%
# mutate(smooth_features = if_else(daterange == 'dis', list(smooth_predictors[2:3]),
# list(smooth_predictors))) %>%
# mutate(full_features = list(c(unlist(smooth_features), unlist(linear_features)))) %>%
# ungroup()
#
# saveRDS(best_models, 'best_gam_models.rds')
best_gam_models <- readRDS('best_gam_models.rds')
# for (model_table_name in model_table_names) {
# model_table <- get(model_table_name)
# write_csv(model_table, paste0('step_gam_tables/', model_table_name, '.csv'))
# }
# read model tables
# for (model_table_name in model_table_names) {
# assign(model_table_name,
# read_csv(paste0('step_gam_tables/', model_table_name, '.csv')))
# }
# model_features_table <- tibble(model_name = character(),
# GCV = numeric(),
# p = numeric(),
# features = list())
#
# # for each model, find the best set of predictors
# for (model in model_table_names) {
# step_gam_table <- get(model)
# best_model <- step_gam_table[nrow(step_gam_table), ]
# best_model_features <-names(best_model)[as.logical(c(0, unname(unlist(c(best_model[1,-1])))))]
# model_features_table <- rbind(model_features_table,
# tibble(model_name = model,
# GCV = best_model$GCV,
# p = length(best_model_features),
# features = list(best_model_features)))
# }
#
# meta <- expand.grid(stat, init, date)
# names(meta) <- c('stat', 'init', 'date')
# model_features_table <- cbind(meta, model_features_table)
#
# saveRDS(model_features_table, 'step_gam_tables/model_features_table.rds')
# model_features_table <- readRDS('step_gam_tables/model_features_table.rds')
# write function that takes in response, predictors, data and returns gam model
# different from stepwise function, this has select = FALSE
get_feature_vector <- function(response, daterange) {
feature_vec <- best_gam_models %>%
filter(response == .env$response & daterange == .env$daterange) %>%
pull(full_features) %>%
unlist()
feature_vec <- str_replace_all(feature_vec, 'month\\d+', 'month')
feature_vec <- str_replace_all(feature_vec, 'year\\d+', 'month')
feature_vec <- str_replace_all(feature_vec, 'weekday\\D+', 'weekday')
feature_vec <- unique(feature_vec)
return(feature_vec)
}
get_data <- function(response, daterange) {
if (response %in% hourly_responses &
daterange == 'full') {
data <- hourly_full
} else if (response %in% hourly_responses){
data <- get(paste0('hourly_data_', daterange))
} else if (daterange == 'full') {
data <- daily_full
} else {
data <- get(paste0('daily_data_', daterange))
}
return(data)
}
get_gam_model <- function(response, daterange) {
predictors <- get_feature_vector(response, daterange)
formula_feature_str <- paste(predictors, collapse = ' + ')
formula_str <- paste(response, formula_feature_str, sep = ' ~ ')
formula <- as.formula(formula_str)
data <- get_data(response, daterange)
gam_model <- gam(formula, data = data, method = 'GCV.Cp')
return(gam_model)
}
# for (i in 1:nrow(best_gam_models)) {
# model <- get_gam_model(best_gam_models$response[i],
# best_gam_models$daterange[i])
# assign(paste0(best_gam_models$name[i], '_', best_gam_models$daterange[i], '_gam'), model)
# saveRDS(model, paste0('gam_models/', paste0(best_gam_models$name[i], '_gam.rds')))
# }
# # compare empty init vs full init and find best ones
# final_model_features_table <- model_features_table %>%
# group_by(stat, date) %>%
# filter(GCV == min(GCV))
#
# final_model_features_table <- final_model_features_table %>%
# select(-c(init, model_name)) %>%
# distinct()
for (i in 1:nrow(best_gam_models)) {
assign(paste0(best_gam_models$name[i], '_gam'),
readRDS(paste0('gam_models/', best_gam_models$name[i], '_gam.rds')))
}
Since February 2022
# Since feb 2022
summary(da_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_gas_2km + elevation + EVI + num_odor_complaints +
## daily_downwind_ref + daily_temp + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.612e+00 2.532e+00 1.032 0.3023
## month02 -1.239e-01 1.518e+00 -0.082 0.9349
## month03 -9.521e-02 2.734e+00 -0.035 0.9722
## month04 -8.055e-02 3.649e+00 -0.022 0.9824
## month05 -1.351e-01 4.262e+00 -0.032 0.9747
## month06 -2.392e-01 4.571e+00 -0.052 0.9583
## month07 -3.464e-01 4.571e+00 -0.076 0.9396
## month08 -3.837e-01 4.262e+00 -0.090 0.9283
## month09 -4.015e-01 3.649e+00 -0.110 0.9124
## month10 -4.329e-01 2.734e+00 -0.158 0.8742
## month11 -2.729e-01 1.518e+00 -0.180 0.8573
## month12 -1.949e-01 3.356e-02 -5.809 6.57e-09 ***
## weekdayMon 8.495e-02 1.462e-02 5.812 6.48e-09 ***
## weekdayTue 1.491e-01 1.459e-02 10.219 < 2e-16 ***
## weekdayWed 1.595e-01 1.463e-02 10.907 < 2e-16 ***
## weekdayThu 1.100e-01 1.464e-02 7.514 6.50e-14 ***
## weekdayFri 1.303e-01 1.460e-02 8.924 < 2e-16 ***
## weekdaySat 7.223e-02 1.459e-02 4.950 7.63e-07 ***
## wd_avg 2.842e-04 5.316e-05 5.346 9.29e-08 ***
## ws_avg -7.080e-02 2.740e-03 -25.839 < 2e-16 ***
## I(1/dist_wrp^2) 8.393e-07 3.713e-07 2.260 0.0238 *
## I(1/dist_ref^2) 1.526e-05 1.077e-05 1.418 0.1564
## I(1/dist_dc^2) -3.424e-04 1.078e-04 -3.175 0.0015 **
## monthly_gas_2km 4.028e-05 9.401e-06 4.285 1.85e-05 ***
## elevation -3.865e-02 4.501e-03 -8.587 < 2e-16 ***
## EVI -1.347e+00 6.496e-02 -20.737 < 2e-16 ***
## num_odor_complaints 9.286e-03 1.876e-03 4.949 7.63e-07 ***
## daily_downwind_ref -4.646e-03 1.619e-02 -0.287 0.7742
## daily_temp 2.071e-03 1.428e-03 1.451 0.1469
## daily_hum -1.024e-02 4.033e-04 -25.399 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) -2.983e-10 0.000 Inf
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.985e+00 8.999 39.13
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.976e+01 80.000 44.58
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 124/127
## R-sq.(adj) = 0.619 Deviance explained = 62.5%
## GCV = 0.10007 Scale est. = 0.098295 n = 6531
Disaster Only
# Disaster only
summary(da_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp",
## k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + weekday +
## wd_avg + ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + inactive_2km + num_odor_complaints + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.242e+01 1.240e+01 2.615 0.009040 **
## weekdayMon -6.532e+00 7.104e+00 -0.920 0.357985
## weekdayTue -1.077e+01 7.280e+00 -1.479 0.139444
## weekdayWed 4.488e-01 7.088e+00 0.063 0.949527
## weekdayThu -3.571e-01 7.076e+00 -0.050 0.959766
## weekdayFri 1.919e+00 6.917e+00 0.277 0.781522
## weekdaySat 1.194e+00 7.040e+00 0.170 0.865381
## wd_avg -6.137e-02 2.086e-02 -2.941 0.003332 **
## ws_avg 6.722e+00 1.791e+00 3.753 0.000183 ***
## I(1/dist_wrp^2) -1.034e-04 5.137e-05 -2.012 0.044470 *
## I(1/dist_ref^2) 1.197e-03 6.008e-04 1.993 0.046481 *
## I(1/dist_dc^2) 3.169e-01 1.510e-01 2.098 0.036104 *
## monthly_oil_2km -5.058e-03 1.771e-03 -2.857 0.004358 **
## inactive_2km 4.246e+00 1.757e+00 2.417 0.015818 *
## num_odor_complaints -1.072e+00 1.618e-01 -6.624 5.3e-11 ***
## daily_hum -2.316e-01 1.185e-01 -1.954 0.050880 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.313 8.838 3.189
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 67.903 80.000 8.117
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9e-04 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 102/105
## R-sq.(adj) = 0.417 Deviance explained = 45.7%
## GCV = 4760.8 Scale est. = 4427.1 n = 1273
Exclude Disaster
# Exclude disaster
summary(da_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## inactive_2km + elevation + EVI + num_odor_complaints + daily_downwind_ref +
## daily_temp + daily_hum + daily_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.903e+00 2.123e+00 0.896 0.370002
## month02 -3.852e-01 1.304e+00 -0.295 0.767767
## month03 -4.816e-01 2.350e+00 -0.205 0.837619
## month04 -5.176e-01 3.137e+00 -0.165 0.868951
## month05 -5.757e-01 3.664e+00 -0.157 0.875163
## month06 -6.525e-01 3.929e+00 -0.166 0.868114
## month07 -5.844e-01 3.929e+00 -0.149 0.881776
## month08 -4.606e-01 3.664e+00 -0.126 0.899978
## month09 -4.104e-01 3.137e+00 -0.131 0.895914
## month10 -4.651e-01 2.350e+00 -0.198 0.843101
## month11 -1.895e-01 1.305e+00 -0.145 0.884495
## month12 -2.310e-02 2.398e-02 -0.963 0.335418
## weekdayMon 9.189e-02 1.362e-02 6.748 1.56e-11 ***
## weekdayTue 1.460e-01 1.357e-02 10.763 < 2e-16 ***
## weekdayWed 1.712e-01 1.356e-02 12.622 < 2e-16 ***
## weekdayThu 1.525e-01 1.357e-02 11.237 < 2e-16 ***
## weekdayFri 1.482e-01 1.357e-02 10.921 < 2e-16 ***
## weekdaySat 7.767e-02 1.358e-02 5.719 1.10e-08 ***
## ws_avg -2.946e-02 1.915e-03 -15.386 < 2e-16 ***
## I(1/dist_wrp^2) 8.190e-07 3.352e-07 2.443 0.014565 *
## I(1/dist_dc^2) 3.751e-04 2.586e-04 1.451 0.146851
## monthly_oil_2km 5.451e-06 4.299e-06 1.268 0.204854
## active_2km 6.908e-03 2.516e-03 2.746 0.006049 **
## inactive_2km -1.944e-03 5.861e-03 -0.332 0.740172
## elevation -1.011e-02 2.775e-03 -3.644 0.000270 ***
## EVI -1.521e+00 9.630e-02 -15.795 < 2e-16 ***
## num_odor_complaints 4.900e-03 1.046e-03 4.682 2.87e-06 ***
## daily_downwind_ref -3.168e-02 1.422e-02 -2.228 0.025913 *
## daily_temp 4.433e-03 1.226e-03 3.617 0.000299 ***
## daily_hum -8.872e-03 3.413e-04 -25.995 < 2e-16 ***
## daily_precip -7.195e-02 2.590e-02 -2.778 0.005476 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) -3.984e-10 2 0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000e+00 9 41.33
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.959e+01 80 54.34
## p-value
## s(as.numeric(month)) 0.82
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 126/128
## R-sq.(adj) = 0.457 Deviance explained = 46.1%
## GCV = 0.18888 Scale est. = 0.18733 n = 14322
Everything w Disaster Indicator
# Disaster indicator
summary(da_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + wd_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## daily_downwind_ref + daily_downwind_wrp + daily_hum + disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.287e+01 1.319e+02 -0.628 0.5299
## month02 -5.163e-01 8.574e+01 -0.006 0.9952
## month03 -8.525e-01 1.545e+02 -0.006 0.9956
## month04 -1.410e+00 2.062e+02 -0.007 0.9945
## month05 -1.750e+00 2.409e+02 -0.007 0.9942
## month06 -1.383e+00 2.583e+02 -0.005 0.9957
## month07 -1.344e+00 2.583e+02 -0.005 0.9958
## month08 -9.400e-01 2.409e+02 -0.004 0.9969
## month09 -1.929e-01 2.062e+02 -0.001 0.9993
## month10 3.427e+00 1.545e+02 0.022 0.9823
## month11 -3.611e+00 8.574e+01 -0.042 0.9664
## month12 -2.900e+00 1.226e+00 -2.366 0.0180 *
## wd_avg -1.032e-02 2.385e-03 -4.330 1.50e-05 ***
## I(1/dist_wrp^2) -2.146e-05 1.108e-05 -1.937 0.0528 .
## I(1/dist_ref^2) -4.456e-04 6.500e-04 -0.686 0.4930
## I(1/dist_dc^2) 7.790e-02 4.998e-02 1.559 0.1191
## monthly_oil_2km 1.170e-04 2.482e-04 0.472 0.6373
## active_2km 6.352e-01 1.113e-01 5.705 1.19e-08 ***
## elevation -6.513e-01 1.298e-01 -5.019 5.26e-07 ***
## EVI -2.944e+01 5.734e+00 -5.133 2.88e-07 ***
## num_odor_complaints 6.447e-01 3.070e-02 21.005 < 2e-16 ***
## closest_wrp_capacity 2.098e-01 3.392e-02 6.183 6.44e-10 ***
## daily_downwind_ref -3.222e+00 7.296e-01 -4.416 1.01e-05 ***
## daily_downwind_wrp 9.430e-01 7.885e-01 1.196 0.2317
## daily_hum -6.342e-02 1.496e-02 -4.238 2.26e-05 ***
## disaster 8.224e+00 1.733e+00 4.745 2.10e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.086e-09 2.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.862e+00 8.962 6.140
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.394e+01 80.000 8.194
## p-value
## s(as.numeric(month)) 2.59e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 120/123
## R-sq.(adj) = 0.114 Deviance explained = 12%
## GCV = 558.81 Scale est. = 555.02 n = 15595
Everything w.o Disaster Indicator
# Everything
summary(da_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + wd_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## daily_downwind_ref + daily_downwind_wrp + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.794e+01 1.320e+02 -0.590 0.5549
## month02 -6.650e-01 8.580e+01 -0.008 0.9938
## month03 -1.020e+00 1.546e+02 -0.007 0.9947
## month04 -1.627e+00 2.063e+02 -0.008 0.9937
## month05 -1.970e+00 2.410e+02 -0.008 0.9935
## month06 -1.521e+00 2.585e+02 -0.006 0.9953
## month07 -1.309e+00 2.585e+02 -0.005 0.9960
## month08 -7.157e-01 2.410e+02 -0.003 0.9976
## month09 1.062e-01 2.063e+02 0.001 0.9996
## month10 6.623e+00 1.546e+02 0.043 0.9658
## month11 -5.479e-01 8.580e+01 -0.006 0.9949
## month12 -5.173e-03 1.064e+00 -0.005 0.9961
## wd_avg -1.039e-02 2.386e-03 -4.353 1.35e-05 ***
## I(1/dist_wrp^2) -2.082e-05 1.117e-05 -1.864 0.0623 .
## I(1/dist_ref^2) -4.340e-04 6.515e-04 -0.666 0.5054
## I(1/dist_dc^2) 7.571e-02 5.013e-02 1.510 0.1310
## monthly_oil_2km 1.614e-04 2.484e-04 0.650 0.5157
## active_2km 6.139e-01 1.114e-01 5.513 3.59e-08 ***
## elevation -6.347e-01 1.298e-01 -4.888 1.03e-06 ***
## EVI -2.876e+01 5.740e+00 -5.010 5.51e-07 ***
## num_odor_complaints 6.535e-01 3.066e-02 21.314 < 2e-16 ***
## closest_wrp_capacity 2.115e-01 3.401e-02 6.219 5.12e-10 ***
## daily_downwind_ref -3.309e+00 7.298e-01 -4.534 5.83e-06 ***
## daily_downwind_wrp 9.697e-01 7.890e-01 1.229 0.2191
## daily_hum -6.741e-02 1.495e-02 -4.509 6.57e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) -1.314e-09 2.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.875e+00 8.966 6.216
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.389e+01 80.000 7.919
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 119/122
## R-sq.(adj) = 0.112 Deviance explained = 11.8%
## GCV = 559.54 Scale est. = 555.78 n = 15595
Since February 2022
# Since feb 2022
summary(log_da_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## inactive_2km + elevation + EVI + num_odor_complaints + daily_temp +
## daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.621e+00 6.273e-01 2.584 0.009778 **
## month02 -1.138e-01 3.602e-01 -0.316 0.752104
## month03 -6.208e-02 6.465e-01 -0.096 0.923512
## month04 6.151e-03 8.620e-01 0.007 0.994307
## month05 -4.032e-02 1.006e+00 -0.040 0.968027
## month06 -1.212e-01 1.078e+00 -0.112 0.910507
## month07 -2.156e-01 1.078e+00 -0.200 0.841512
## month08 -2.532e-01 1.005e+00 -0.252 0.801092
## month09 -3.077e-01 8.603e-01 -0.358 0.720634
## month10 -5.063e-01 6.463e-01 -0.783 0.433374
## month11 -3.737e-01 3.616e-01 -1.033 0.301503
## month12 -2.853e-01 4.146e-02 -6.882 6.44e-12 ***
## weekdayMon 1.230e-01 1.810e-02 6.794 1.19e-11 ***
## weekdayTue 2.077e-01 1.807e-02 11.494 < 2e-16 ***
## weekdayWed 2.044e-01 1.811e-02 11.282 < 2e-16 ***
## weekdayThu 1.482e-01 1.813e-02 8.177 3.48e-16 ***
## weekdayFri 1.691e-01 1.808e-02 9.352 < 2e-16 ***
## weekdaySat 1.139e-01 1.807e-02 6.303 3.11e-10 ***
## wd_avg 3.931e-04 6.582e-05 5.972 2.46e-09 ***
## ws_avg -1.032e-01 3.392e-03 -30.431 < 2e-16 ***
## I(1/dist_wrp^2) 4.383e-08 3.894e-07 0.113 0.910365
## I(1/dist_ref^2) 1.114e-06 5.251e-06 0.212 0.831928
## I(1/dist_dc^2) -2.247e-04 6.819e-05 -3.296 0.000987 ***
## inactive_2km 1.684e-02 4.975e-03 3.385 0.000716 ***
## elevation -4.086e-02 7.744e-03 -5.276 1.36e-07 ***
## EVI -1.907e+00 1.450e-01 -13.150 < 2e-16 ***
## num_odor_complaints 1.300e-02 2.323e-03 5.598 2.25e-08 ***
## daily_temp 2.309e-03 1.766e-03 1.308 0.190996
## daily_hum -1.301e-02 4.956e-04 -26.254 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) -4.862e-12 3.000 0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 7.659e+00 8.287 54.63
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.989e+01 80.000 77.10
## p-value
## s(as.numeric(month)) 1
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 123/126
## R-sq.(adj) = 0.7 Deviance explained = 70.5%
## GCV = 0.15343 Scale est. = 0.15076 n = 6531
Disaster Only
# Disaster only
summary(log_da_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_avg) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp",
## "cc")) + month + weekday + ws_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## inactive_2km + elevation + num_odor_complaints + closest_wrp_capacity +
## daily_downwind_ref + daily_temp + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.399e+01 5.713e+00 -2.449 0.014450 *
## month11 5.086e-01 1.310e-01 3.882 0.000109 ***
## month12 4.853e-01 1.452e-01 3.343 0.000854 ***
## weekdayMon 4.504e-02 6.325e-02 0.712 0.476553
## weekdayTue -1.303e-01 6.478e-02 -2.012 0.044445 *
## weekdayWed 1.073e-01 6.281e-02 1.708 0.087868 .
## weekdayThu 7.894e-02 6.303e-02 1.252 0.210689
## weekdayFri -9.842e-02 6.144e-02 -1.602 0.109460
## weekdaySat -1.387e-01 6.204e-02 -2.236 0.025544 *
## ws_avg -1.428e-01 1.628e-02 -8.766 < 2e-16 ***
## I(1/dist_wrp^2) -1.877e-05 7.598e-06 -2.470 0.013655 *
## I(1/dist_ref^2) -1.698e-04 7.029e-05 -2.415 0.015875 *
## I(1/dist_dc^2) 1.556e-01 6.090e-02 2.556 0.010722 *
## monthly_oil_2km 3.410e-04 1.225e-04 2.783 0.005466 **
## active_2km -4.484e-01 4.093e-02 -10.956 < 2e-16 ***
## inactive_2km 9.736e-01 1.340e-01 7.263 6.83e-13 ***
## elevation -2.777e-01 3.584e-02 -7.748 1.99e-14 ***
## num_odor_complaints 8.276e-03 1.340e-03 6.177 8.98e-10 ***
## closest_wrp_capacity 4.577e-02 1.095e-02 4.182 3.11e-05 ***
## daily_downwind_ref -8.101e-02 5.618e-02 -1.442 0.149606
## daily_temp 9.549e-03 6.866e-03 1.391 0.164564
## daily_hum -8.024e-03 1.338e-03 -5.997 2.67e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.00 9 25.693
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 58.55 80 9.163
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 108/111
## R-sq.(adj) = 0.746 Deviance explained = 76.3%
## GCV = 0.36938 Scale est. = 0.34427 n = 1273
Exclude Disaster
# Exclude disaster
summary(log_da_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km +
## active_2km + inactive_2km + elevation + EVI + num_odor_complaints +
## daily_downwind_ref + daily_temp + daily_hum + daily_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.639e-01 8.054e-01 0.203 0.8388
## month02 -3.120e-01 2.630e+00 -0.119 0.9056
## month03 -2.684e-01 3.316e+00 -0.081 0.9355
## month04 -1.519e-01 2.721e+00 -0.056 0.9555
## month05 -7.152e-02 2.205e+00 -0.032 0.9741
## month06 -9.384e-02 2.404e+00 -0.039 0.9689
## month07 -2.557e-02 2.390e+00 -0.011 0.9915
## month08 6.363e-02 2.178e+00 0.029 0.9767
## month09 -2.818e-02 2.708e+00 -0.010 0.9917
## month10 -3.782e-01 3.312e+00 -0.114 0.9091
## month11 -1.397e-01 2.628e+00 -0.053 0.9576
## month12 -3.944e-02 2.658e-02 -1.484 0.1378
## weekdayMon 1.204e-01 1.509e-02 7.984 1.53e-15 ***
## weekdayTue 1.830e-01 1.503e-02 12.175 < 2e-16 ***
## weekdayWed 2.079e-01 1.502e-02 13.835 < 2e-16 ***
## weekdayThu 1.912e-01 1.503e-02 12.721 < 2e-16 ***
## weekdayFri 1.900e-01 1.504e-02 12.632 < 2e-16 ***
## weekdaySat 1.055e-01 1.505e-02 7.009 2.51e-12 ***
## ws_avg -4.004e-02 2.121e-03 -18.876 < 2e-16 ***
## I(1/dist_wrp^2) 2.021e-06 3.719e-07 5.435 5.58e-08 ***
## I(1/dist_ref^2) -8.447e-05 1.742e-05 -4.850 1.25e-06 ***
## I(1/dist_dc^2) -4.965e-04 2.597e-04 -1.912 0.0559 .
## monthly_oil_2km -8.034e-07 4.776e-06 -0.168 0.8664
## active_2km 1.955e-02 2.804e-03 6.972 3.27e-12 ***
## inactive_2km -6.689e-03 6.590e-03 -1.015 0.3101
## elevation -4.141e-03 3.076e-03 -1.346 0.1782
## EVI -2.488e+00 1.069e-01 -23.269 < 2e-16 ***
## num_odor_complaints 4.609e-03 1.159e-03 3.976 7.06e-05 ***
## daily_downwind_ref 6.259e-03 1.576e-02 0.397 0.6912
## daily_temp 7.312e-03 1.358e-03 5.386 7.33e-08 ***
## daily_hum -1.082e-02 3.781e-04 -28.618 < 2e-16 ***
## daily_precip -1.219e-01 2.869e-02 -4.246 2.19e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.354 5 0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 103.12
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.903 80 99.03
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 124/129
## R-sq.(adj) = 0.567 Deviance explained = 57.1%
## GCV = 0.2318 Scale est. = 0.2299 n = 14322
Everything w Disaster Indicator
# Disaster indicator
summary(log_da_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + ws_avg +
## I(1/dist_dc^2) + monthly_oil_2km + active_2km + inactive_2km +
## elevation + EVI + num_odor_complaints + daily_downwind_ref +
## daily_temp + daily_hum + daily_precip + disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.546e-01 8.667e-01 0.986 0.324142
## month02 -2.977e-01 2.700e+00 -0.110 0.912190
## month03 -2.729e-01 3.401e+00 -0.080 0.936039
## month04 -1.507e-01 2.780e+00 -0.054 0.956769
## month05 -7.388e-02 2.245e+00 -0.033 0.973747
## month06 -8.268e-02 2.464e+00 -0.034 0.973229
## month07 -8.274e-03 2.461e+00 -0.003 0.997318
## month08 9.803e-02 2.240e+00 0.044 0.965092
## month09 1.840e-02 2.779e+00 0.007 0.994717
## month10 -9.720e-02 3.401e+00 -0.029 0.977198
## month11 -2.626e-01 2.700e+00 -0.097 0.922510
## month12 -1.975e-01 2.854e-02 -6.921 4.67e-12 ***
## weekdayMon 1.076e-01 1.642e-02 6.551 5.90e-11 ***
## weekdayTue 1.493e-01 1.637e-02 9.122 < 2e-16 ***
## weekdayWed 1.984e-01 1.635e-02 12.129 < 2e-16 ***
## weekdayThu 1.793e-01 1.637e-02 10.951 < 2e-16 ***
## weekdayFri 1.622e-01 1.634e-02 9.923 < 2e-16 ***
## weekdaySat 8.083e-02 1.637e-02 4.937 8.03e-07 ***
## ws_avg -4.208e-02 2.366e-03 -17.785 < 2e-16 ***
## I(1/dist_dc^2) 1.637e-03 2.719e-04 6.020 1.78e-09 ***
## monthly_oil_2km -1.693e-05 4.941e-06 -3.427 0.000612 ***
## active_2km 1.825e-02 2.987e-03 6.111 1.01e-09 ***
## inactive_2km 7.144e-03 6.860e-03 1.041 0.297675
## elevation -2.010e-02 3.376e-03 -5.953 2.68e-09 ***
## EVI -2.494e+00 1.138e-01 -21.925 < 2e-16 ***
## num_odor_complaints 1.837e-02 7.114e-04 25.821 < 2e-16 ***
## daily_downwind_ref -4.329e-02 1.681e-02 -2.576 0.010003 *
## daily_temp -3.715e-03 1.425e-03 -2.608 0.009120 **
## daily_hum -1.142e-02 3.860e-04 -29.574 < 2e-16 ***
## daily_precip -1.313e-01 2.726e-02 -4.816 1.48e-06 ***
## disaster 6.501e-01 4.012e-02 16.204 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.354 4.000 0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.521 8.832 45.76
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.456 80.000 88.00
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 125/128
## R-sq.(adj) = 0.551 Deviance explained = 55.4%
## GCV = 0.29873 Scale est. = 0.29647 n = 15595
Everything w.o Disaster Indicator
# Everything
summary(log_da_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + ws_avg +
## I(1/dist_dc^2) + monthly_oil_2km + active_2km + inactive_2km +
## elevation + EVI + num_odor_complaints + daily_downwind_ref +
## daily_temp + daily_hum + daily_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.304e+00 8.735e-01 1.493 0.13538
## month02 -2.819e-01 2.722e+00 -0.104 0.91753
## month03 -2.357e-01 3.429e+00 -0.069 0.94522
## month04 -1.066e-01 2.803e+00 -0.038 0.96967
## month05 -3.566e-02 2.264e+00 -0.016 0.98743
## month06 -5.742e-02 2.484e+00 -0.023 0.98156
## month07 5.528e-03 2.481e+00 0.002 0.99822
## month08 1.016e-01 2.259e+00 0.045 0.96412
## month09 1.082e-02 2.802e+00 0.004 0.99692
## month10 1.226e-01 3.429e+00 0.036 0.97149
## month11 -4.326e-02 2.722e+00 -0.016 0.98732
## month12 3.138e-02 2.502e-02 1.254 0.20978
## weekdayMon 1.074e-01 1.656e-02 6.485 9.13e-11 ***
## weekdayTue 1.492e-01 1.651e-02 9.040 < 2e-16 ***
## weekdayWed 1.983e-01 1.649e-02 12.026 < 2e-16 ***
## weekdayThu 1.775e-01 1.651e-02 10.753 < 2e-16 ***
## weekdayFri 1.608e-01 1.648e-02 9.758 < 2e-16 ***
## weekdaySat 8.007e-02 1.651e-02 4.850 1.25e-06 ***
## ws_avg -4.196e-02 2.386e-03 -17.584 < 2e-16 ***
## I(1/dist_dc^2) 1.396e-03 3.149e-04 4.432 9.40e-06 ***
## monthly_oil_2km -1.530e-05 4.984e-06 -3.070 0.00215 **
## active_2km 1.544e-02 3.015e-03 5.120 3.09e-07 ***
## inactive_2km 1.108e-02 6.949e-03 1.594 0.11096
## elevation -1.903e-02 3.406e-03 -5.588 2.33e-08 ***
## EVI -2.425e+00 1.148e-01 -21.121 < 2e-16 ***
## num_odor_complaints 1.905e-02 7.161e-04 26.600 < 2e-16 ***
## daily_downwind_ref -4.994e-02 1.694e-02 -2.948 0.00320 **
## daily_temp -3.731e-03 1.436e-03 -2.597 0.00940 **
## daily_hum -1.175e-02 3.887e-04 -30.220 < 2e-16 ***
## daily_precip -1.255e-01 2.749e-02 -4.567 4.99e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.354 4.000 0.001
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.442 8.824 44.510
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.711 80.000 83.880
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 124/127
## R-sq.(adj) = 0.544 Deviance explained = 54.7%
## GCV = 0.30374 Scale est. = 0.30146 n = 15595
Since February 2022
# Since feb 2022
summary(dm_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + monthly_oil_2km +
## active_2km + num_odor_complaints + daily_downwind_wrp + daily_temp +
## daily_hum + daily_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.417e+00 1.173e+00 2.914 0.003580 **
## month02 -5.408e-01 4.326e-01 -1.250 0.211292
## month03 -6.875e-01 6.110e-01 -1.125 0.260519
## month04 -1.207e+00 7.272e-01 -1.660 0.097032 .
## month05 -1.321e+00 7.623e-01 -1.733 0.083133 .
## month06 -1.608e+00 7.671e-01 -2.097 0.036063 *
## month07 -1.761e+00 7.344e-01 -2.398 0.016534 *
## month08 -1.645e+00 6.579e-01 -2.500 0.012439 *
## month09 -1.875e+00 5.600e-01 -3.348 0.000820 ***
## month10 -1.707e+00 5.926e-01 -2.881 0.003980 **
## month11 -9.262e-01 5.106e-01 -1.814 0.069751 .
## month12 1.781e-01 3.551e-01 0.502 0.616032
## weekdayMon 5.553e-01 1.557e-01 3.567 0.000364 ***
## weekdayTue 9.128e-01 1.555e-01 5.872 4.52e-09 ***
## weekdayWed 8.820e-01 1.558e-01 5.663 1.55e-08 ***
## weekdayThu 7.505e-01 1.559e-01 4.814 1.52e-06 ***
## weekdayFri 8.598e-01 1.555e-01 5.528 3.36e-08 ***
## weekdaySat 3.445e-01 1.554e-01 2.216 0.026697 *
## wd_avg 2.017e-03 5.651e-04 3.570 0.000360 ***
## ws_avg -2.202e-01 3.005e-02 -7.326 2.65e-13 ***
## I(1/dist_wrp^2) -2.092e-05 2.595e-06 -8.060 9.02e-16 ***
## I(1/dist_ref^2) 3.425e-04 4.276e-05 8.009 1.37e-15 ***
## monthly_oil_2km 9.761e-05 2.899e-05 3.368 0.000763 ***
## active_2km -6.509e-02 1.055e-02 -6.172 7.17e-10 ***
## num_odor_complaints 1.365e-01 1.941e-02 7.035 2.20e-12 ***
## daily_downwind_wrp 4.426e-01 1.850e-01 2.392 0.016778 *
## daily_temp 4.927e-02 1.537e-02 3.205 0.001358 **
## daily_hum -3.440e-02 4.376e-03 -7.861 4.43e-15 ***
## daily_precip 4.083e-01 2.438e-01 1.675 0.093988 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 5.711e-14 1.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.840e+00 8.944 14.696
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 4.944e+01 80.000 1.542
## p-value
## s(as.numeric(month)) 0.5
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 124/126
## R-sq.(adj) = 0.209 Deviance explained = 21.9%
## GCV = 11.299 Scale est. = 11.151 n = 6531
Disaster Only
# Disaster only
summary(dm_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp",
## k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + inactive_2km + num_odor_complaints + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.849e+02 9.013e+01 3.161 0.00161 **
## wd_avg -5.063e-01 1.611e-01 -3.143 0.00172 **
## ws_avg 4.253e+01 1.327e+01 3.206 0.00138 **
## I(1/dist_wrp^2) -9.204e-04 4.076e-04 -2.258 0.02413 *
## I(1/dist_ref^2) 1.050e-02 4.707e-03 2.231 0.02589 *
## I(1/dist_dc^2) 2.830e+00 1.205e+00 2.350 0.01895 *
## monthly_oil_2km -3.934e-02 1.377e-02 -2.858 0.00434 **
## inactive_2km 3.257e+01 1.364e+01 2.388 0.01708 *
## num_odor_complaints -8.312e+00 1.257e+00 -6.614 5.65e-11 ***
## daily_hum -1.896e+00 8.990e-01 -2.109 0.03519 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.359 8.855 3.374
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 68.202 80.000 8.513
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 0.000459 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 96/99
## R-sq.(adj) = 0.432 Deviance explained = 46.9%
## GCV = 2.8585e+05 Scale est. = 2.6709e+05 n = 1273
Exclude Disaster
# Exclude disaster
summary(dm_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + EVI + num_odor_complaints +
## daily_temp + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.032e+00 1.477e+00 2.730 0.00634 **
## month02 -5.824e-01 7.727e-01 -0.754 0.45098
## month03 -6.730e-01 1.368e+00 -0.492 0.62277
## month04 -7.405e-01 1.823e+00 -0.406 0.68454
## month05 -1.445e+00 2.128e+00 -0.679 0.49712
## month06 -1.849e+00 2.283e+00 -0.810 0.41811
## month07 -1.727e+00 2.285e+00 -0.756 0.44978
## month08 -1.496e+00 2.134e+00 -0.701 0.48333
## month09 -1.402e+00 1.832e+00 -0.765 0.44405
## month10 -1.417e+00 1.388e+00 -1.021 0.30729
## month11 -6.786e-01 7.943e-01 -0.854 0.39292
## month12 -1.517e-01 2.287e-01 -0.663 0.50709
## weekdayMon 5.627e-01 1.318e-01 4.269 1.97e-05 ***
## weekdayTue 9.583e-01 1.313e-01 7.301 3.01e-13 ***
## weekdayWed 8.572e-01 1.312e-01 6.532 6.70e-11 ***
## weekdayThu 7.778e-01 1.313e-01 5.925 3.20e-09 ***
## weekdayFri 7.427e-01 1.313e-01 5.656 1.58e-08 ***
## weekdaySat 3.145e-01 1.314e-01 2.393 0.01672 *
## wd_avg 1.193e-03 4.528e-04 2.635 0.00843 **
## ws_avg -1.040e-01 1.795e-02 -5.797 6.91e-09 ***
## I(1/dist_wrp^2) 3.984e-06 6.889e-07 5.783 7.50e-09 ***
## I(1/dist_ref^2) 2.288e-05 2.695e-06 8.492 < 2e-16 ***
## EVI -3.015e+00 3.761e-01 -8.015 1.18e-15 ***
## num_odor_complaints 7.792e-02 1.001e-02 7.787 7.35e-15 ***
## daily_temp 2.929e-02 1.183e-02 2.476 0.01329 *
## daily_hum -3.160e-02 3.260e-03 -9.695 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 2.367e-12 2 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000e+00 9 13.289
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 6.218e+01 80 4.227
## p-value
## s(as.numeric(month)) 1
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 121/123
## R-sq.(adj) = 0.123 Deviance explained = 12.9%
## GCV = 17.654 Scale est. = 17.536 n = 14322
Everything w Disaster Indicator
# Disaster indicator
summary(dm_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + wd_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## daily_downwind_ref + daily_downwind_wrp + daily_hum + disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.825e+02 1.032e+03 -0.661 0.508591
## month02 -1.581e+00 6.710e+02 -0.002 0.998120
## month03 -4.092e+00 1.209e+03 -0.003 0.997299
## month04 -8.671e+00 1.614e+03 -0.005 0.995713
## month05 -1.180e+01 1.885e+03 -0.006 0.995005
## month06 -8.519e+00 2.021e+03 -0.004 0.996637
## month07 -8.552e+00 2.021e+03 -0.004 0.996624
## month08 -5.798e+00 1.885e+03 -0.003 0.997546
## month09 2.304e-01 1.614e+03 0.000 0.999886
## month10 3.085e+01 1.209e+03 0.026 0.979641
## month11 -2.810e+01 6.710e+02 -0.042 0.966598
## month12 -2.317e+01 9.594e+00 -2.415 0.015760 *
## wd_avg -8.180e-02 1.866e-02 -4.383 1.18e-05 ***
## I(1/dist_wrp^2) -1.880e-04 9.013e-05 -2.086 0.037013 *
## I(1/dist_ref^2) -3.224e-03 5.131e-03 -0.628 0.529805
## I(1/dist_dc^2) 6.701e-01 4.085e-01 1.641 0.100914
## monthly_oil_2km 1.015e-03 1.958e-03 0.518 0.604164
## active_2km 5.052e+00 8.734e-01 5.785 7.40e-09 ***
## elevation -5.216e+00 1.018e+00 -5.125 3.02e-07 ***
## EVI -2.285e+02 4.502e+01 -5.075 3.93e-07 ***
## num_odor_complaints 5.415e+00 2.403e-01 22.536 < 2e-16 ***
## closest_wrp_capacity 1.695e+00 2.683e-01 6.319 2.71e-10 ***
## daily_downwind_ref -2.562e+01 5.710e+00 -4.488 7.25e-06 ***
## daily_downwind_wrp 8.202e+00 6.171e+00 1.329 0.183811
## daily_hum -4.493e-01 1.171e-01 -3.837 0.000125 ***
## disaster 6.676e+01 1.356e+01 4.922 8.68e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.192e-09 2.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.895e+00 8.972 6.560
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.435e+01 80.000 8.876
## p-value
## s(as.numeric(month)) 5.74e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 120/123
## R-sq.(adj) = 0.124 Deviance explained = 12.9%
## GCV = 34221 Scale est. = 33988 n = 15595
Everything w.o Disaster Indicator
# Everything
summary(dm_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + wd_avg + ws_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## daily_downwind_ref + daily_downwind_wrp + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.413e+02 1.033e+03 -0.621 0.5348
## month02 -3.419e+00 6.714e+02 -0.005 0.9959
## month03 -6.472e+00 1.210e+03 -0.005 0.9957
## month04 -1.131e+01 1.615e+03 -0.007 0.9944
## month05 -1.446e+01 1.886e+03 -0.008 0.9939
## month06 -1.036e+01 2.023e+03 -0.005 0.9959
## month07 -8.594e+00 2.023e+03 -0.004 0.9966
## month08 -4.102e+00 1.886e+03 -0.002 0.9983
## month09 3.124e+00 1.615e+03 0.002 0.9985
## month10 5.769e+01 1.210e+03 0.048 0.9620
## month11 -2.498e+00 6.714e+02 -0.004 0.9970
## month12 8.355e-01 8.333e+00 0.100 0.9201
## wd_avg -8.470e-02 1.873e-02 -4.522 6.16e-06 ***
## ws_avg 1.233e+00 7.795e-01 1.582 0.1136
## I(1/dist_wrp^2) -1.716e-04 8.833e-05 -1.943 0.0521 .
## I(1/dist_ref^2) -3.335e-03 5.091e-03 -0.655 0.5125
## I(1/dist_dc^2) 6.105e-01 3.967e-01 1.539 0.1239
## monthly_oil_2km 1.254e-03 1.948e-03 0.643 0.5200
## active_2km 4.873e+00 8.721e-01 5.588 2.34e-08 ***
## elevation -4.980e+00 1.019e+00 -4.889 1.02e-06 ***
## EVI -2.185e+02 4.500e+01 -4.855 1.22e-06 ***
## num_odor_complaints 5.488e+00 2.400e-01 22.869 < 2e-16 ***
## closest_wrp_capacity 1.690e+00 2.669e-01 6.334 2.45e-10 ***
## daily_downwind_ref -2.643e+01 5.712e+00 -4.627 3.74e-06 ***
## daily_downwind_wrp 8.226e+00 6.176e+00 1.332 0.1829
## daily_hum -4.765e-01 1.170e-01 -4.072 4.69e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.775e-10 3.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.881e+00 8.968 6.677
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.403e+01 80.000 8.547
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 120/123
## R-sq.(adj) = 0.122 Deviance explained = 12.8%
## GCV = 34269 Scale est. = 34036 n = 15595
Since February 2022
# Since feb 2022
summary(log_dm_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_gas_2km +
## elevation + EVI + num_odor_complaints + daily_temp + daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.337e+00 7.403e+00 0.316 0.752263
## month02 -4.496e-02 2.231e+01 -0.002 0.998392
## month03 -8.551e-03 2.810e+01 0.000 0.999757
## month04 8.563e-02 2.306e+01 0.004 0.997038
## month05 2.317e-01 1.879e+01 0.012 0.990161
## month06 2.818e-01 2.052e+01 0.014 0.989045
## month07 2.890e-01 2.034e+01 0.014 0.988662
## month08 2.375e-01 1.847e+01 0.013 0.989741
## month09 3.436e-02 2.301e+01 0.001 0.998808
## month10 -3.449e-01 2.815e+01 -0.012 0.990225
## month11 -2.837e-01 2.233e+01 -0.013 0.989866
## month12 -1.854e-01 6.654e-02 -2.787 0.005343 **
## weekdayMon 2.598e-01 2.901e-02 8.955 < 2e-16 ***
## weekdayTue 3.497e-01 2.897e-02 12.073 < 2e-16 ***
## weekdayWed 3.293e-01 2.903e-02 11.341 < 2e-16 ***
## weekdayThu 2.469e-01 2.906e-02 8.496 < 2e-16 ***
## weekdayFri 2.610e-01 2.897e-02 9.008 < 2e-16 ***
## weekdaySat 1.755e-01 2.897e-02 6.060 1.44e-09 ***
## wd_avg 3.930e-04 1.055e-04 3.725 0.000197 ***
## ws_avg -1.300e-01 5.435e-03 -23.925 < 2e-16 ***
## I(1/dist_ref^2) 4.217e-05 3.018e-05 1.397 0.162377
## I(1/dist_dc^2) -1.807e-04 1.881e-04 -0.961 0.336623
## monthly_gas_2km 2.978e-05 1.857e-05 1.603 0.108887
## elevation -5.687e-02 8.935e-03 -6.365 2.08e-10 ***
## EVI -1.413e+00 1.276e-01 -11.074 < 2e-16 ***
## num_odor_complaints 2.857e-02 3.723e-03 7.674 1.91e-14 ***
## daily_temp 9.838e-03 2.830e-03 3.476 0.000512 ***
## daily_hum -1.386e-02 7.946e-04 -17.450 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.353 2 0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 25.95
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.051 80 18.10
## p-value
## s(as.numeric(month)) 0.000698 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 121/125
## R-sq.(adj) = 0.544 Deviance explained = 55.2%
## GCV = 0.39419 Scale est. = 0.38731 n = 6531
Disaster Only
# Disaster only
summary(log_dm_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_max) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp",
## "cc")) + month + weekday + wd_avg + ws_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + monthly_gas_2km +
## inactive_2km + elevation + num_odor_complaints + daily_temp +
## daily_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.788e+00 8.089e-01 8.391 < 2e-16 ***
## month11 -1.956e-01 1.658e-01 -1.180 0.238194
## month12 -4.188e-01 1.844e-01 -2.271 0.023313 *
## weekdayMon 1.455e-01 8.201e-02 1.775 0.076195 .
## weekdayTue -8.291e-02 8.401e-02 -0.987 0.323859
## weekdayWed 4.878e-02 8.157e-02 0.598 0.549949
## weekdayThu 1.501e-01 8.180e-02 1.835 0.066792 .
## weekdayFri -2.089e-01 7.977e-02 -2.619 0.008934 **
## weekdaySat -1.335e-01 8.061e-02 -1.657 0.097819 .
## wd_avg 2.471e-04 2.384e-04 1.037 0.300113
## ws_avg -1.441e-01 2.116e-02 -6.809 1.56e-11 ***
## I(1/dist_wrp^2) -3.130e-06 7.624e-07 -4.105 4.31e-05 ***
## I(1/dist_ref^2) 5.212e-05 7.485e-06 6.964 5.48e-12 ***
## I(1/dist_dc^2) 1.662e-02 2.265e-03 7.340 3.96e-13 ***
## monthly_oil_2km 4.527e-04 6.189e-05 7.314 4.74e-13 ***
## monthly_gas_2km -2.848e-03 3.507e-04 -8.119 1.17e-15 ***
## inactive_2km 8.977e-02 2.344e-02 3.830 0.000135 ***
## elevation -2.525e-01 2.570e-02 -9.826 < 2e-16 ***
## num_odor_complaints 8.459e-03 1.720e-03 4.919 9.90e-07 ***
## daily_temp 7.311e-03 8.888e-03 0.823 0.410953
## daily_hum -6.435e-03 1.733e-03 -3.712 0.000215 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.938 8.991 15.488
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 56.931 80.000 7.708
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 107/110
## R-sq.(adj) = 0.725 Deviance explained = 74.3%
## GCV = 0.621 Scale est. = 0.58009 n = 1273
Exclude Disaster
# Exclude disaster
summary(log_dm_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_ref^2) + monthly_gas_2km + active_2km +
## inactive_2km + elevation + EVI + num_odor_complaints + daily_downwind_wrp +
## daily_temp + daily_hum + daily_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.360e+00 4.563e+00 0.298 0.76561
## month02 -2.655e-01 2.805e+00 -0.095 0.92462
## month03 -3.708e-01 5.054e+00 -0.073 0.94152
## month04 -4.552e-01 6.747e+00 -0.067 0.94621
## month05 -5.671e-01 7.881e+00 -0.072 0.94264
## month06 -7.032e-01 8.451e+00 -0.083 0.93369
## month07 -6.054e-01 8.452e+00 -0.072 0.94290
## month08 -4.295e-01 7.881e+00 -0.054 0.95654
## month09 -3.728e-01 6.747e+00 -0.055 0.95593
## month10 -5.761e-01 5.054e+00 -0.114 0.90925
## month11 -2.056e-01 2.806e+00 -0.073 0.94159
## month12 -7.229e-02 3.654e-02 -1.978 0.04793 *
## weekdayMon 2.614e-01 2.074e-02 12.599 < 2e-16 ***
## weekdayTue 3.066e-01 2.066e-02 14.842 < 2e-16 ***
## weekdayWed 3.344e-01 2.065e-02 16.196 < 2e-16 ***
## weekdayThu 2.971e-01 2.066e-02 14.378 < 2e-16 ***
## weekdayFri 2.896e-01 2.066e-02 14.014 < 2e-16 ***
## weekdaySat 1.537e-01 2.068e-02 7.433 1.12e-13 ***
## wd_avg 1.886e-04 7.179e-05 2.628 0.00860 **
## ws_avg -5.397e-02 2.928e-03 -18.434 < 2e-16 ***
## I(1/dist_ref^2) 1.677e-05 1.440e-05 1.164 0.24431
## monthly_gas_2km 5.280e-06 1.517e-05 0.348 0.72775
## active_2km 2.310e-02 3.522e-03 6.558 5.63e-11 ***
## inactive_2km -2.618e-02 8.261e-03 -3.169 0.00153 **
## elevation -1.969e-02 4.091e-03 -4.814 1.50e-06 ***
## EVI -2.136e+00 1.228e-01 -17.390 < 2e-16 ***
## num_odor_complaints 1.234e-02 1.591e-03 7.754 9.51e-15 ***
## daily_downwind_wrp 4.891e-02 2.298e-02 2.128 0.03334 *
## daily_temp 1.481e-02 1.873e-03 7.905 2.86e-15 ***
## daily_hum -1.259e-02 5.221e-04 -24.114 < 2e-16 ***
## daily_precip -1.038e-01 3.950e-02 -2.628 0.00860 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 2.474e-10 1 0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000e+00 9 34.32
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.909e+01 80 42.29
## p-value
## s(as.numeric(month)) 0.5
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 127/128
## R-sq.(adj) = 0.481 Deviance explained = 48.5%
## GCV = 0.43774 Scale est. = 0.43413 n = 14322
Everything w Disaster Indicator
# Disaster indicator
summary(log_dm_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + ws_avg +
## I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) + elevation +
## EVI + num_odor_complaints + closest_wrp_capacity + daily_downwind_ref +
## daily_temp + daily_hum + daily_precip + disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.689e+00 3.416e-01 10.802 < 2e-16 ***
## month02 -3.025e-01 1.595e-01 -1.897 0.05782 .
## month03 -5.292e-01 2.840e-01 -1.863 0.06244 .
## month04 -6.785e-01 3.787e-01 -1.792 0.07318 .
## month05 -8.450e-01 4.419e-01 -1.912 0.05589 .
## month06 -9.548e-01 4.735e-01 -2.016 0.04379 *
## month07 -7.869e-01 4.732e-01 -1.663 0.09631 .
## month08 -5.000e-01 4.413e-01 -1.133 0.25717
## month09 -3.404e-01 3.783e-01 -0.900 0.36824
## month10 -1.182e-01 2.853e-01 -0.414 0.67881
## month11 -3.480e-01 1.618e-01 -2.151 0.03146 *
## month12 -2.763e-01 3.767e-02 -7.333 2.36e-13 ***
## weekdayMon 2.426e-01 2.171e-02 11.177 < 2e-16 ***
## weekdayTue 2.676e-01 2.164e-02 12.369 < 2e-16 ***
## weekdayWed 3.125e-01 2.162e-02 14.452 < 2e-16 ***
## weekdayThu 2.805e-01 2.164e-02 12.960 < 2e-16 ***
## weekdayFri 2.422e-01 2.160e-02 11.209 < 2e-16 ***
## weekdaySat 1.217e-01 2.165e-02 5.622 1.92e-08 ***
## ws_avg -5.479e-02 3.124e-03 -17.537 < 2e-16 ***
## I(1/dist_wrp^2) -4.816e-01 2.939e-02 -16.388 < 2e-16 ***
## I(1/dist_ref^2) -1.948e-03 1.209e-04 -16.110 < 2e-16 ***
## I(1/dist_dc^2) 4.262e+03 2.600e+02 16.388 < 2e-16 ***
## elevation -2.551e-02 3.838e-03 -6.646 3.11e-11 ***
## EVI -1.392e+00 7.331e-02 -18.995 < 2e-16 ***
## num_odor_complaints 2.453e-02 9.400e-04 26.092 < 2e-16 ***
## closest_wrp_capacity -3.532e-03 3.517e-04 -10.044 < 2e-16 ***
## daily_downwind_ref 1.927e-02 2.231e-02 0.864 0.38773
## daily_temp -6.874e-04 1.882e-03 -0.365 0.71492
## daily_hum -1.322e-02 5.104e-04 -25.913 < 2e-16 ***
## daily_precip -9.862e-02 3.603e-02 -2.737 0.00621 **
## disaster 9.166e-01 5.299e-02 17.299 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) -2.453e-12 1.000 0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.524e+00 8.844 32.95
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.876e+01 80.000 41.70
## p-value
## s(as.numeric(month)) 0.5
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 126/128
## R-sq.(adj) = 0.503 Deviance explained = 50.7%
## GCV = 0.52205 Scale est. = 0.51816 n = 15595
Everything w.o Disaster Indicator
# Everything
summary(log_dm_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_daily_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## daily_downwind_ref + daily_temp + daily_hum + daily_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.256e+00 7.479e-01 5.691 1.29e-08 ***
## month02 -3.219e-01 4.639e-01 -0.694 0.4877
## month03 -5.495e-01 8.346e-01 -0.658 0.5103
## month04 -7.015e-01 1.114e+00 -0.630 0.5289
## month05 -8.651e-01 1.301e+00 -0.665 0.5061
## month06 -9.659e-01 1.395e+00 -0.692 0.4887
## month07 -7.788e-01 1.395e+00 -0.558 0.5766
## month08 -4.726e-01 1.301e+00 -0.363 0.7164
## month09 -3.053e-01 1.114e+00 -0.274 0.7840
## month10 2.407e-01 8.347e-01 0.288 0.7731
## month11 -4.731e-03 4.642e-01 -0.010 0.9919
## month12 4.651e-02 3.300e-02 1.410 0.1587
## weekdayMon 2.431e-01 2.192e-02 11.091 < 2e-16 ***
## weekdayTue 2.674e-01 2.185e-02 12.242 < 2e-16 ***
## weekdayWed 3.125e-01 2.183e-02 14.315 < 2e-16 ***
## weekdayThu 2.780e-01 2.185e-02 12.722 < 2e-16 ***
## weekdayFri 2.403e-01 2.181e-02 11.020 < 2e-16 ***
## weekdaySat 1.204e-01 2.185e-02 5.511 3.62e-08 ***
## wd_avg 9.950e-05 7.450e-05 1.336 0.1817
## ws_avg -5.504e-02 3.168e-03 -17.378 < 2e-16 ***
## I(1/dist_wrp^2) -4.959e-01 2.969e-02 -16.701 < 2e-16 ***
## I(1/dist_ref^2) -1.930e-02 1.158e-03 -16.670 < 2e-16 ***
## I(1/dist_dc^2) 4.381e+03 2.623e+02 16.701 < 2e-16 ***
## elevation -2.365e-02 3.871e-03 -6.110 1.02e-09 ***
## EVI -1.387e+00 7.389e-02 -18.778 < 2e-16 ***
## num_odor_complaints 2.549e-02 9.471e-04 26.910 < 2e-16 ***
## closest_wrp_capacity -3.447e-03 3.536e-04 -9.749 < 2e-16 ***
## daily_downwind_ref 9.237e-03 2.253e-02 0.410 0.6819
## daily_temp -5.642e-04 1.907e-03 -0.296 0.7673
## daily_hum -1.363e-02 5.172e-04 -26.357 < 2e-16 ***
## daily_precip -8.552e-02 3.653e-02 -2.341 0.0192 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) -2.863e-11 2.000 0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.394e+00 8.791 32.50
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.867e+01 80.000 37.32
## p-value
## s(as.numeric(month)) 1
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 126/128
## R-sq.(adj) = 0.494 Deviance explained = 49.7%
## GCV = 0.53207 Scale est. = 0.52811 n = 15595
Since February 2022
# Since feb 2022
summary(ha_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_ref + hourly_downwind_wrp + hourly_temp +
## hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.428e+00 2.291e-01 23.689 < 2e-16 ***
## month02 7.579e-02 1.071e+00 0.071 0.943578
## month03 3.688e-01 1.684e+00 0.219 0.826667
## month04 6.329e-01 1.755e+00 0.361 0.718297
## month05 6.251e-01 1.339e+00 0.467 0.640672
## month06 5.330e-01 6.819e-01 0.782 0.434435
## month07 3.795e-01 7.165e-01 0.530 0.596307
## month08 2.974e-01 1.381e+00 0.215 0.829538
## month09 3.008e-01 1.786e+00 0.168 0.866287
## month10 1.082e-01 1.702e+00 0.064 0.949302
## month11 -9.159e-02 1.076e+00 -0.085 0.932159
## month12 -1.805e-01 1.268e-02 -14.231 < 2e-16 ***
## weekdayMon 7.595e-02 5.570e-03 13.635 < 2e-16 ***
## weekdayTue 1.404e-01 5.521e-03 25.429 < 2e-16 ***
## weekdayWed 1.480e-01 5.534e-03 26.752 < 2e-16 ***
## weekdayThu 1.075e-01 5.537e-03 19.412 < 2e-16 ***
## weekdayFri 1.252e-01 5.517e-03 22.701 < 2e-16 ***
## weekdaySat 6.518e-02 5.502e-03 11.847 < 2e-16 ***
## wd_avg -2.063e-04 1.769e-05 -11.657 < 2e-16 ***
## ws_avg -5.071e-02 6.893e-04 -73.560 < 2e-16 ***
## I(1/dist_wrp^2) 1.093e-06 6.567e-08 16.640 < 2e-16 ***
## I(1/dist_ref^2) 8.588e-06 8.110e-07 10.589 < 2e-16 ***
## I(1/dist_dc^2) -4.173e-04 1.314e-05 -31.751 < 2e-16 ***
## monthly_oil_2km 7.987e-06 2.177e-06 3.668 0.000244 ***
## monthly_gas_2km -3.974e-05 6.338e-06 -6.270 3.61e-10 ***
## active_2km 1.475e-02 1.744e-03 8.456 < 2e-16 ***
## inactive_2km -3.677e-02 4.986e-03 -7.375 1.65e-13 ***
## elevation -3.057e-02 3.671e-03 -8.329 < 2e-16 ***
## EVI -1.467e+00 8.117e-02 -18.072 < 2e-16 ***
## num_odor_complaints 1.398e-02 7.215e-04 19.381 < 2e-16 ***
## closest_wrp_capacity -3.027e-03 4.255e-04 -7.114 1.13e-12 ***
## hourly_downwind_ref -2.855e-02 5.411e-03 -5.277 1.32e-07 ***
## hourly_downwind_wrp 1.910e-02 6.253e-03 3.054 0.002258 **
## hourly_temp -3.228e-02 3.322e-04 -97.181 < 2e-16 ***
## hourly_hum -9.569e-03 1.181e-04 -81.050 < 2e-16 ***
## hourly_precip 4.285e-01 1.009e-01 4.249 2.15e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 0.8031 2.000 0.013
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.2526 8.253 148.921
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.9926 80.000 394.272
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 128/133
## R-sq.(adj) = 0.352 Deviance explained = 35.3%
## GCV = 0.33353 Scale est. = 0.33327 n = 153718
Disaster Only
# Disaster only
summary(ha_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_avg ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp",
## k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + month +
## weekday + wd_avg + ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) +
## I(1/dist_dc^2) + monthly_oil_2km + monthly_gas_2km + active_2km +
## inactive_2km + elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_wrp + hourly_temp + hourly_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.439e+02 2.504e+02 1.373 0.169667
## month11 -5.068e+00 5.033e+00 -1.007 0.313944
## month12 -1.720e+00 5.452e+00 -0.316 0.752358
## weekdayMon -3.528e+00 2.586e+00 -1.364 0.172618
## weekdayTue -6.387e+00 2.595e+00 -2.461 0.013857 *
## weekdayWed -1.574e+00 2.562e+00 -0.615 0.538825
## weekdayThu 1.794e+00 2.558e+00 0.701 0.483077
## weekdayFri 2.918e+00 2.502e+00 1.166 0.243428
## weekdaySat 2.057e+00 2.531e+00 0.813 0.416351
## wd_avg -2.871e-02 6.943e-03 -4.135 3.56e-05 ***
## ws_avg 1.454e+00 3.702e-01 3.929 8.55e-05 ***
## I(1/dist_wrp^2) -6.655e-04 1.696e-04 -3.925 8.71e-05 ***
## I(1/dist_ref^2) -1.667e-03 2.781e-03 -0.599 0.548960
## I(1/dist_dc^2) 5.893e+00 1.665e+00 3.539 0.000402 ***
## monthly_oil_2km -1.799e-02 8.448e-03 -2.130 0.033219 *
## monthly_gas_2km -2.634e-02 2.249e-02 -1.172 0.241372
## active_2km -1.138e+01 1.964e+00 -5.795 6.91e-09 ***
## inactive_2km 7.207e+01 6.829e+00 10.554 < 2e-16 ***
## elevation -3.683e+01 1.824e+00 -20.188 < 2e-16 ***
## EVI -6.007e+02 4.246e+01 -14.146 < 2e-16 ***
## num_odor_complaints -1.253e+00 6.419e-02 -19.512 < 2e-16 ***
## closest_wrp_capacity 1.175e+00 4.648e-01 2.527 0.011497 *
## hourly_downwind_wrp 7.409e+00 2.649e+00 2.797 0.005164 **
## hourly_temp -8.394e-01 1.383e-01 -6.071 1.29e-09 ***
## hourly_hum -1.086e-01 4.156e-02 -2.613 0.008980 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.707 8.932 83.18
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.390 80.000 63.25
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 111/114
## R-sq.(adj) = 0.21 Deviance explained = 21.3%
## GCV = 13802 Scale est. = 13753 n = 30242
Exclude Disaster
# Exclude disaster
summary(ha_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_gas_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref +
## hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.171e+00 1.228e-01 42.102 < 2e-16 ***
## month02 -3.275e-01 1.861e-02 -17.599 < 2e-16 ***
## month03 -4.058e-01 3.220e-02 -12.602 < 2e-16 ***
## month04 -3.016e-01 4.275e-02 -7.055 1.74e-12 ***
## month05 -2.952e-01 4.988e-02 -5.919 3.25e-09 ***
## month06 -2.669e-01 5.353e-02 -4.987 6.14e-07 ***
## month07 -1.019e-01 5.360e-02 -1.902 0.057209 .
## month08 7.398e-02 5.016e-02 1.475 0.140227
## month09 1.094e-01 4.321e-02 2.531 0.011388 *
## month10 -2.459e-02 3.323e-02 -0.740 0.459290
## month11 -7.539e-02 1.997e-02 -3.775 0.000160 ***
## month12 5.687e-03 8.501e-03 0.669 0.503519
## weekdayMon 8.491e-02 4.871e-03 17.430 < 2e-16 ***
## weekdayTue 1.449e-01 4.831e-03 29.991 < 2e-16 ***
## weekdayWed 1.670e-01 4.827e-03 34.594 < 2e-16 ***
## weekdayThu 1.527e-01 4.826e-03 31.636 < 2e-16 ***
## weekdayFri 1.416e-01 4.827e-03 29.336 < 2e-16 ***
## weekdaySat 6.808e-02 4.818e-03 14.130 < 2e-16 ***
## wd_avg -3.939e-04 1.531e-05 -25.719 < 2e-16 ***
## ws_avg -3.267e-02 5.122e-04 -63.792 < 2e-16 ***
## I(1/dist_wrp^2) 5.650e-07 1.133e-07 4.988 6.12e-07 ***
## I(1/dist_ref^2) -1.923e-06 5.686e-06 -0.338 0.735164
## I(1/dist_dc^2) 3.696e-04 7.516e-05 4.917 8.78e-07 ***
## monthly_gas_2km -4.793e-05 3.706e-06 -12.935 < 2e-16 ***
## active_2km 7.070e-03 8.514e-04 8.303 < 2e-16 ***
## inactive_2km -1.998e-03 2.628e-03 -0.760 0.447140
## elevation -1.247e-02 1.107e-03 -11.261 < 2e-16 ***
## EVI -1.395e+00 4.691e-02 -29.745 < 2e-16 ***
## num_odor_complaints 5.820e-03 3.757e-04 15.493 < 2e-16 ***
## closest_wrp_capacity -2.193e-03 2.696e-04 -8.133 4.21e-16 ***
## hourly_downwind_ref -5.354e-02 4.630e-03 -11.566 < 2e-16 ***
## hourly_downwind_wrp 2.387e-02 5.121e-03 4.662 3.13e-06 ***
## hourly_temp -3.283e-02 2.799e-04 -117.273 < 2e-16 ***
## hourly_hum -9.138e-03 9.611e-05 -95.077 < 2e-16 ***
## hourly_precip -4.412e-01 1.143e-01 -3.859 0.000114 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) -5.732e-12 1 0.0
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.989e+00 9 317.6
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.997e+01 80 470.4
## p-value
## s(as.numeric(month)) 1.59e-05 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) < 2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 129/132
## R-sq.(adj) = 0.246 Deviance explained = 24.6%
## GCV = 0.56167 Scale est. = 0.56147 n = 337596
Everything w Disaster Indicator
# Disaster indicator
summary(ha_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_ref + hourly_downwind_wrp + hourly_temp +
## hourly_hum + hourly_precip + disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.461e+02 4.269e+01 -3.422 0.000621 ***
## month02 -2.079e-01 2.771e+01 -0.008 0.994014
## month03 -5.634e-01 4.992e+01 -0.011 0.990995
## month04 -4.432e-01 6.664e+01 -0.007 0.994694
## month05 -6.434e-01 7.784e+01 -0.008 0.993406
## month06 5.551e-02 8.348e+01 0.001 0.999469
## month07 3.045e-01 8.348e+01 0.004 0.997090
## month08 9.068e-01 7.784e+01 0.012 0.990705
## month09 1.462e+00 6.664e+01 0.022 0.982495
## month10 4.600e+00 4.992e+01 0.092 0.926590
## month11 -3.497e+00 2.771e+01 -0.126 0.899592
## month12 -3.341e+00 3.976e-01 -8.402 < 2e-16 ***
## weekdayMon -6.853e-01 2.303e-01 -2.975 0.002927 **
## weekdayTue -6.205e-01 2.285e-01 -2.716 0.006615 **
## weekdayWed -3.120e-01 2.282e-01 -1.367 0.171625
## weekdayThu -8.171e-02 2.282e-01 -0.358 0.720303
## weekdayFri 2.022e-01 2.279e-01 0.887 0.375079
## weekdaySat 1.024e-01 2.278e-01 0.449 0.653151
## wd_avg -7.110e-03 7.129e-04 -9.973 < 2e-16 ***
## ws_avg 1.418e-01 2.452e-02 5.782 7.39e-09 ***
## I(1/dist_wrp^2) 4.580e-06 3.929e-06 1.166 0.243733
## I(1/dist_ref^2) -2.979e-03 4.154e-04 -7.171 7.46e-13 ***
## I(1/dist_dc^2) 2.296e-01 2.938e-02 7.816 5.48e-15 ***
## monthly_oil_2km 3.011e-04 8.786e-05 3.427 0.000611 ***
## monthly_gas_2km 6.389e-07 1.795e-04 0.004 0.997161
## active_2km 2.318e-01 4.148e-02 5.588 2.30e-08 ***
## inactive_2km 2.514e+00 1.166e-01 21.554 < 2e-16 ***
## elevation -1.335e+00 5.276e-02 -25.295 < 2e-16 ***
## EVI -5.344e+01 2.178e+00 -24.539 < 2e-16 ***
## num_odor_complaints 5.729e-01 1.002e-02 57.162 < 2e-16 ***
## closest_wrp_capacity 4.007e-01 1.447e-02 27.684 < 2e-16 ***
## hourly_downwind_ref -1.995e+00 2.153e-01 -9.268 < 2e-16 ***
## hourly_downwind_wrp 1.523e+00 2.415e-01 6.308 2.83e-10 ***
## hourly_temp -2.245e-01 1.276e-02 -17.591 < 2e-16 ***
## hourly_hum -5.432e-02 4.319e-03 -12.579 < 2e-16 ***
## hourly_precip -2.470e+00 4.969e+00 -0.497 0.619039
## disaster 7.970e+00 5.600e-01 14.231 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) -5.76e-09 1 0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.00e+00 9 84.69
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.75e+01 80 79.15
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 131/134
## R-sq.(adj) = 0.0488 Deviance explained = 4.91%
## GCV = 1367.6 Scale est. = 1367.2 n = 367838
Everything w.o Disaster Indicator
# Everything
summary(ha_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_avg ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_ref + hourly_downwind_wrp + hourly_temp +
## hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.421e+02 4.271e+01 -3.328 0.000874 ***
## month02 -3.171e-01 2.772e+01 -0.011 0.990873
## month03 -6.998e-01 4.993e+01 -0.014 0.988819
## month04 -6.096e-01 6.666e+01 -0.009 0.992704
## month05 -8.162e-01 7.786e+01 -0.010 0.991637
## month06 -3.188e-02 8.350e+01 0.000 0.999695
## month07 3.831e-01 8.350e+01 0.005 0.996339
## month08 1.163e+00 7.786e+01 0.015 0.988083
## month09 1.786e+00 6.666e+01 0.027 0.978623
## month10 7.730e+00 4.994e+01 0.155 0.876973
## month11 -5.089e-01 2.772e+01 -0.018 0.985353
## month12 -5.331e-01 3.454e-01 -1.543 0.122736
## weekdayMon -6.847e-01 2.304e-01 -2.972 0.002958 **
## weekdayTue -6.158e-01 2.286e-01 -2.694 0.007052 **
## weekdayWed -3.131e-01 2.283e-01 -1.371 0.170285
## weekdayThu -9.952e-02 2.283e-01 -0.436 0.662819
## weekdayFri 1.851e-01 2.280e-01 0.812 0.416865
## weekdaySat 8.977e-02 2.279e-01 0.394 0.693623
## wd_avg -7.143e-03 7.131e-04 -10.016 < 2e-16 ***
## ws_avg 1.408e-01 2.453e-02 5.739 9.53e-09 ***
## I(1/dist_wrp^2) 5.879e-06 3.990e-06 1.473 0.140624
## I(1/dist_ref^2) -2.993e-03 4.165e-04 -7.186 6.70e-13 ***
## I(1/dist_dc^2) 2.254e-01 2.939e-02 7.667 1.76e-14 ***
## monthly_oil_2km 3.395e-04 8.803e-05 3.857 0.000115 ***
## monthly_gas_2km 9.856e-06 1.814e-04 0.054 0.956678
## active_2km 2.019e-01 4.147e-02 4.869 1.12e-06 ***
## inactive_2km 2.564e+00 1.168e-01 21.962 < 2e-16 ***
## elevation -1.331e+00 5.280e-02 -25.207 < 2e-16 ***
## EVI -5.316e+01 2.179e+00 -24.396 < 2e-16 ***
## num_odor_complaints 5.810e-01 1.001e-02 58.048 < 2e-16 ***
## closest_wrp_capacity 4.048e-01 1.449e-02 27.939 < 2e-16 ***
## hourly_downwind_ref -2.041e+00 2.153e-01 -9.481 < 2e-16 ***
## hourly_downwind_wrp 1.559e+00 2.415e-01 6.453 1.10e-10 ***
## hourly_temp -2.276e-01 1.277e-02 -17.831 < 2e-16 ***
## hourly_hum -5.672e-02 4.317e-03 -13.140 < 2e-16 ***
## hourly_precip -1.979e+00 4.970e+00 -0.398 0.690535
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) -1.049e-09 2 0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000e+00 9 86.14
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.768e+01 80 76.89
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 130/133
## R-sq.(adj) = 0.0483 Deviance explained = 4.86%
## GCV = 1368.4 Scale est. = 1367.9 n = 367838
Since February 2022
# Since feb 2022
summary(log_ha_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km +
## elevation + EVI + num_odor_complaints + hourly_downwind_ref +
## hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.207e+00 7.312e-01 4.386 1.16e-05 ***
## month02 3.402e-02 4.379e-01 0.078 0.938072
## month03 2.133e-01 7.887e-01 0.270 0.786847
## month04 3.888e-01 1.053e+00 0.369 0.711880
## month05 2.624e-01 1.230e+00 0.213 0.830997
## month06 1.090e-01 1.319e+00 0.083 0.934136
## month07 -2.448e-02 1.319e+00 -0.019 0.985191
## month08 -3.735e-02 1.230e+00 -0.030 0.975770
## month09 8.082e-02 1.053e+00 0.077 0.938801
## month10 -7.892e-02 7.887e-01 -0.100 0.920293
## month11 -1.731e-01 4.380e-01 -0.395 0.692690
## month12 -2.351e-01 1.206e-02 -19.498 < 2e-16 ***
## weekdayMon 9.059e-02 5.295e-03 17.109 < 2e-16 ***
## weekdayTue 1.575e-01 5.248e-03 30.001 < 2e-16 ***
## weekdayWed 1.514e-01 5.261e-03 28.777 < 2e-16 ***
## weekdayThu 1.084e-01 5.264e-03 20.600 < 2e-16 ***
## weekdayFri 1.266e-01 5.245e-03 24.141 < 2e-16 ***
## weekdaySat 7.732e-02 5.230e-03 14.785 < 2e-16 ***
## wd_avg -3.798e-04 1.682e-05 -22.580 < 2e-16 ***
## ws_avg -6.208e-02 6.553e-04 -94.732 < 2e-16 ***
## I(1/dist_wrp^2) 1.276e-06 6.672e-08 19.120 < 2e-16 ***
## I(1/dist_ref^2) -1.255e-05 3.232e-06 -3.883 0.000103 ***
## I(1/dist_dc^2) -6.694e-04 2.379e-05 -28.136 < 2e-16 ***
## monthly_oil_2km 3.505e-07 2.070e-06 0.169 0.865544
## monthly_gas_2km -4.783e-05 6.025e-06 -7.939 2.05e-15 ***
## active_2km 1.795e-02 1.658e-03 10.825 < 2e-16 ***
## inactive_2km -6.403e-03 3.469e-03 -1.845 0.064979 .
## elevation -4.601e-02 2.711e-03 -16.971 < 2e-16 ***
## EVI -2.373e+00 6.234e-02 -38.067 < 2e-16 ***
## num_odor_complaints 1.437e-02 6.859e-04 20.948 < 2e-16 ***
## hourly_downwind_ref -7.808e-03 5.144e-03 -1.518 0.129064
## hourly_downwind_wrp 4.748e-02 5.945e-03 7.986 1.40e-15 ***
## hourly_temp -3.114e-02 3.158e-04 -98.625 < 2e-16 ***
## hourly_hum -9.724e-03 1.122e-04 -86.645 < 2e-16 ***
## hourly_precip 1.964e-01 9.588e-02 2.048 0.040524 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.153e-10 0 Inf
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000e+00 9 341.1
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 8.000e+01 80 1154.9
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 129/132
## R-sq.(adj) = 0.548 Deviance explained = 54.8%
## GCV = 0.30141 Scale est. = 0.30117 n = 153718
Disaster Only
# Disaster only
summary(log_ha_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_avg) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp",
## "cc")) + month + weekday + wd_avg + ws_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## inactive_2km + elevation + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_ref + hourly_downwind_wrp + hourly_temp +
## hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.338e+01 1.863e+00 -12.550 < 2e-16 ***
## month11 1.553e-01 3.505e-02 4.431 9.39e-06 ***
## month12 -1.110e-01 3.620e-02 -3.065 0.00218 **
## weekdayMon 2.516e-02 1.808e-02 1.392 0.16397
## weekdayTue -9.922e-02 1.814e-02 -5.470 4.54e-08 ***
## weekdayWed 1.126e-01 1.791e-02 6.291 3.20e-10 ***
## weekdayThu 9.509e-02 1.806e-02 5.265 1.41e-07 ***
## weekdayFri 3.090e-02 1.749e-02 1.767 0.07732 .
## weekdaySat -8.340e-02 1.769e-02 -4.714 2.44e-06 ***
## wd_avg -7.720e-04 4.893e-05 -15.778 < 2e-16 ***
## ws_avg -1.110e-01 2.666e-03 -41.647 < 2e-16 ***
## I(1/dist_wrp^2) -4.535e-05 3.437e-06 -13.194 < 2e-16 ***
## I(1/dist_ref^2) -2.659e-04 1.972e-05 -13.487 < 2e-16 ***
## I(1/dist_dc^2) 3.288e-01 2.425e-02 13.559 < 2e-16 ***
## monthly_oil_2km 6.284e-04 4.022e-05 15.627 < 2e-16 ***
## active_2km -3.871e-01 1.253e-02 -30.882 < 2e-16 ***
## inactive_2km 6.700e-01 4.309e-02 15.551 < 2e-16 ***
## elevation -2.089e-01 1.147e-02 -18.209 < 2e-16 ***
## num_odor_complaints 4.279e-03 4.510e-04 9.489 < 2e-16 ***
## closest_wrp_capacity 6.845e-02 3.546e-03 19.302 < 2e-16 ***
## hourly_downwind_ref -1.641e-01 1.466e-02 -11.197 < 2e-16 ***
## hourly_downwind_wrp 2.034e-01 1.867e-02 10.898 < 2e-16 ***
## hourly_temp -3.594e-02 9.671e-04 -37.166 < 2e-16 ***
## hourly_hum -1.034e-02 2.939e-04 -35.177 < 2e-16 ***
## hourly_precip 1.048e+00 2.535e-01 4.134 3.57e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.00 9 250.62
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.37 80 88.83
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 111/114
## R-sq.(adj) = 0.583 Deviance explained = 58.5%
## GCV = 0.67426 Scale est. = 0.6718 n = 30242
Exclude Disaster
# Exclude disaster
summary(log_ha_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_ref + hourly_downwind_wrp + hourly_temp +
## hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.553e+00 1.233e-01 28.809 < 2e-16 ***
## month02 -3.315e-01 1.317e-02 -25.166 < 2e-16 ***
## month03 -4.046e-01 2.238e-02 -18.079 < 2e-16 ***
## month04 -2.607e-01 2.963e-02 -8.798 < 2e-16 ***
## month05 -1.949e-01 3.455e-02 -5.642 1.69e-08 ***
## month06 -1.509e-01 3.709e-02 -4.068 4.74e-05 ***
## month07 4.910e-02 3.716e-02 1.321 0.186406
## month08 2.784e-01 3.482e-02 7.994 1.31e-15 ***
## month09 3.001e-01 3.009e-02 9.973 < 2e-16 ***
## month10 3.470e-03 2.341e-02 0.148 0.882168
## month11 -8.855e-03 1.447e-02 -0.612 0.540524
## month12 6.263e-03 7.087e-03 0.884 0.376852
## weekdayMon 9.329e-02 4.061e-03 22.974 < 2e-16 ***
## weekdayTue 1.470e-01 4.027e-03 36.502 < 2e-16 ***
## weekdayWed 1.691e-01 4.023e-03 42.040 < 2e-16 ***
## weekdayThu 1.539e-01 4.023e-03 38.254 < 2e-16 ***
## weekdayFri 1.443e-01 4.024e-03 35.860 < 2e-16 ***
## weekdaySat 7.404e-02 4.016e-03 18.436 < 2e-16 ***
## wd_avg -6.137e-04 1.277e-05 -48.073 < 2e-16 ***
## ws_avg -3.843e-02 4.269e-04 -90.028 < 2e-16 ***
## I(1/dist_wrp^2) 2.574e-06 1.140e-07 22.583 < 2e-16 ***
## I(1/dist_ref^2) -1.092e-04 4.979e-06 -21.930 < 2e-16 ***
## I(1/dist_dc^2) 8.109e-04 1.821e-04 4.454 8.43e-06 ***
## monthly_oil_2km -4.300e-06 1.662e-06 -2.588 0.009662 **
## monthly_gas_2km -5.942e-05 3.248e-06 -18.295 < 2e-16 ***
## active_2km 1.841e-02 7.543e-04 24.403 < 2e-16 ***
## inactive_2km -7.903e-03 2.199e-03 -3.594 0.000326 ***
## elevation -6.017e-03 9.253e-04 -6.503 7.89e-11 ***
## EVI -2.287e+00 3.914e-02 -58.419 < 2e-16 ***
## num_odor_complaints 2.953e-03 3.132e-04 9.427 < 2e-16 ***
## closest_wrp_capacity -2.429e-03 2.687e-04 -9.041 < 2e-16 ***
## hourly_downwind_ref -3.675e-02 3.859e-03 -9.523 < 2e-16 ***
## hourly_downwind_wrp 6.508e-02 4.268e-03 15.246 < 2e-16 ***
## hourly_temp -3.036e-02 2.333e-04 -130.116 < 2e-16 ***
## hourly_hum -8.381e-03 8.012e-05 -104.604 < 2e-16 ***
## hourly_precip -7.615e-01 9.530e-02 -7.991 1.34e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 5.881e-12 0 Inf
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.998e+00 9 1217
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.999e+01 80 1558
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 130/133
## R-sq.(adj) = 0.442 Deviance explained = 44.3%
## GCV = 0.39025 Scale est. = 0.3901 n = 337596
Everything w Disaster Indicator
# Disaster indicator
summary(log_ha_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref +
## hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip +
## disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.077e+00 2.482e-01 4.338 1.44e-05 ***
## month02 -2.238e-01 6.918e-01 -0.324 0.746
## month03 -2.337e-01 8.715e-01 -0.268 0.789
## month04 -6.518e-02 7.124e-01 -0.091 0.927
## month05 -2.003e-03 5.753e-01 -0.003 0.997
## month06 -3.908e-03 6.314e-01 -0.006 0.995
## month07 1.030e-01 6.307e-01 0.163 0.870
## month08 2.540e-01 5.741e-01 0.442 0.658
## month09 2.208e-01 7.120e-01 0.310 0.756
## month10 2.476e-02 8.714e-01 0.028 0.977
## month11 -1.114e-01 6.918e-01 -0.161 0.872
## month12 -1.504e-01 7.296e-03 -20.616 < 2e-16 ***
## weekdayMon 8.104e-02 4.221e-03 19.200 < 2e-16 ***
## weekdayTue 1.197e-01 4.187e-03 28.589 < 2e-16 ***
## weekdayWed 1.606e-01 4.182e-03 38.411 < 2e-16 ***
## weekdayThu 1.438e-01 4.182e-03 34.380 < 2e-16 ***
## weekdayFri 1.314e-01 4.177e-03 31.452 < 2e-16 ***
## weekdaySat 6.026e-02 4.175e-03 14.435 < 2e-16 ***
## wd_avg -6.880e-04 1.306e-05 -52.657 < 2e-16 ***
## ws_avg -4.069e-02 4.494e-04 -90.536 < 2e-16 ***
## I(1/dist_wrp^2) 1.648e-06 1.034e-07 15.935 < 2e-16 ***
## I(1/dist_ref^2) -1.252e-04 4.855e-06 -25.789 < 2e-16 ***
## I(1/dist_dc^2) 6.981e-03 2.977e-04 23.451 < 2e-16 ***
## monthly_oil_2km -7.660e-06 1.570e-06 -4.880 1.06e-06 ***
## active_2km 2.180e-02 7.782e-04 28.007 < 2e-16 ***
## inactive_2km 2.259e-02 2.248e-03 10.047 < 2e-16 ***
## elevation -3.036e-02 9.705e-04 -31.285 < 2e-16 ***
## EVI -3.065e+00 3.985e-02 -76.922 < 2e-16 ***
## num_odor_complaints 1.222e-02 1.837e-04 66.535 < 2e-16 ***
## closest_wrp_capacity 3.715e-03 2.673e-04 13.898 < 2e-16 ***
## hourly_downwind_ref -6.755e-02 3.946e-03 -17.120 < 2e-16 ***
## hourly_downwind_wrp 8.569e-02 4.422e-03 19.377 < 2e-16 ***
## hourly_temp -3.332e-02 2.339e-04 -142.468 < 2e-16 ***
## hourly_hum -9.015e-03 7.914e-05 -113.911 < 2e-16 ***
## hourly_precip -9.368e-01 9.106e-02 -10.287 < 2e-16 ***
## disaster 4.800e-01 1.028e-02 46.710 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.354 6 0.002
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.988 9 734.615
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.968 80 1476.695
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 128/133
## R-sq.(adj) = 0.438 Deviance explained = 43.8%
## GCV = 0.45927 Scale est. = 0.45912 n = 367838
Everything w.o Disaster Indicator
# Everything
summary(log_ha_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_avg) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + active_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref +
## hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.279e+00 2.489e-01 5.140 2.75e-07 ***
## month02 -2.113e-01 6.939e-01 -0.305 0.76072
## month03 -2.096e-01 8.740e-01 -0.240 0.81052
## month04 -3.973e-02 7.145e-01 -0.056 0.95565
## month05 1.578e-02 5.770e-01 0.027 0.97818
## month06 3.355e-03 6.333e-01 0.005 0.99577
## month07 1.014e-01 6.326e-01 0.160 0.87263
## month08 2.469e-01 5.758e-01 0.429 0.66813
## month09 2.091e-01 7.142e-01 0.293 0.76965
## month10 1.841e-01 8.740e-01 0.211 0.83315
## month11 5.095e-02 6.939e-01 0.073 0.94146
## month12 1.905e-02 6.349e-03 3.000 0.00270 **
## weekdayMon 8.107e-02 4.233e-03 19.151 < 2e-16 ***
## weekdayTue 1.200e-01 4.200e-03 28.568 < 2e-16 ***
## weekdayWed 1.606e-01 4.195e-03 38.275 < 2e-16 ***
## weekdayThu 1.427e-01 4.194e-03 34.018 < 2e-16 ***
## weekdayFri 1.303e-01 4.189e-03 31.112 < 2e-16 ***
## weekdaySat 5.951e-02 4.187e-03 14.212 < 2e-16 ***
## wd_avg -6.895e-04 1.310e-05 -52.619 < 2e-16 ***
## ws_avg -4.077e-02 4.508e-04 -90.448 < 2e-16 ***
## I(1/dist_wrp^2) 1.883e-06 1.026e-07 18.344 < 2e-16 ***
## I(1/dist_ref^2) -1.327e-04 4.890e-06 -27.133 < 2e-16 ***
## I(1/dist_dc^2) 6.616e-03 2.917e-04 22.682 < 2e-16 ***
## monthly_oil_2km -5.366e-06 1.574e-06 -3.410 0.00065 ***
## active_2km 1.967e-02 7.795e-04 25.237 < 2e-16 ***
## inactive_2km 2.720e-02 2.255e-03 12.058 < 2e-16 ***
## elevation -3.014e-02 9.736e-04 -30.961 < 2e-16 ***
## EVI -3.047e+00 3.998e-02 -76.224 < 2e-16 ***
## num_odor_complaints 1.272e-02 1.839e-04 69.151 < 2e-16 ***
## closest_wrp_capacity 4.033e-03 2.682e-04 15.037 < 2e-16 ***
## hourly_downwind_ref -7.040e-02 3.957e-03 -17.793 < 2e-16 ***
## hourly_downwind_wrp 8.775e-02 4.435e-03 19.784 < 2e-16 ***
## hourly_temp -3.350e-02 2.346e-04 -142.832 < 2e-16 ***
## hourly_hum -9.160e-03 7.932e-05 -115.485 < 2e-16 ***
## hourly_precip -9.074e-01 9.133e-02 -9.935 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.354 6 0.002
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.994 9 740.199
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.984 80 1445.150
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 127/132
## R-sq.(adj) = 0.435 Deviance explained = 43.5%
## GCV = 0.46199 Scale est. = 0.46184 n = 367838
Since February 2022
# Since feb 2022
summary(hm_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + monthly_gas_2km + inactive_2km + elevation +
## EVI + num_odor_complaints + closest_wrp_capacity + hourly_downwind_ref +
## hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.792e+00 5.587e-01 12.158 < 2e-16 ***
## month02 4.507e-02 1.723e+00 0.026 0.979127
## month03 3.985e-01 2.172e+00 0.183 0.854442
## month04 6.758e-01 1.785e+00 0.379 0.705020
## month05 6.962e-01 1.450e+00 0.480 0.631039
## month06 6.309e-01 1.576e+00 0.400 0.688975
## month07 4.864e-01 1.563e+00 0.311 0.755711
## month08 4.418e-01 1.428e+00 0.309 0.756992
## month09 4.402e-01 1.780e+00 0.247 0.804686
## month10 1.809e-01 2.174e+00 0.083 0.933703
## month11 -6.956e-02 1.723e+00 -0.040 0.967804
## month12 -1.587e-01 2.603e-02 -6.098 1.08e-09 ***
## weekdayMon 1.238e-01 1.143e-02 10.831 < 2e-16 ***
## weekdayTue 2.128e-01 1.133e-02 18.777 < 2e-16 ***
## weekdayWed 2.199e-01 1.136e-02 19.365 < 2e-16 ***
## weekdayThu 1.682e-01 1.136e-02 14.799 < 2e-16 ***
## weekdayFri 1.990e-01 1.132e-02 17.578 < 2e-16 ***
## weekdaySat 9.236e-02 1.129e-02 8.180 2.86e-16 ***
## wd_avg -7.348e-05 3.631e-05 -2.024 0.043009 *
## ws_avg -6.778e-02 1.415e-03 -47.908 < 2e-16 ***
## I(1/dist_wrp^2) 8.233e-07 1.345e-07 6.122 9.26e-10 ***
## I(1/dist_ref^2) 2.110e-05 1.770e-06 11.926 < 2e-16 ***
## I(1/dist_dc^2) -4.611e-04 2.916e-05 -15.813 < 2e-16 ***
## monthly_oil_2km 1.332e-05 4.208e-06 3.166 0.001547 **
## monthly_gas_2km -5.650e-08 1.269e-05 -0.004 0.996449
## inactive_2km -2.039e-02 9.167e-03 -2.225 0.026105 *
## elevation -4.044e-02 7.753e-03 -5.217 1.82e-07 ***
## EVI -1.419e+00 1.512e-01 -9.382 < 2e-16 ***
## num_odor_complaints 3.459e-02 1.481e-03 23.359 < 2e-16 ***
## closest_wrp_capacity -4.057e-03 9.017e-04 -4.499 6.82e-06 ***
## hourly_downwind_ref -4.650e-02 1.111e-02 -4.187 2.82e-05 ***
## hourly_downwind_wrp 4.576e-02 1.283e-02 3.565 0.000363 ***
## hourly_temp -3.825e-02 6.815e-04 -56.123 < 2e-16 ***
## hourly_hum -1.149e-02 2.423e-04 -47.413 < 2e-16 ***
## hourly_precip 1.078e+00 2.070e-01 5.206 1.93e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.351 3.000 0.002
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.253 8.253 62.118
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.976 80.000 130.311
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 126/132
## R-sq.(adj) = 0.175 Deviance explained = 17.5%
## GCV = 1.4048 Scale est. = 1.4037 n = 153718
Disaster Only
# Disaster only
summary(hm_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_max ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp",
## k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + month +
## weekday + wd_avg + ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) +
## I(1/dist_dc^2) + monthly_oil_2km + monthly_gas_2km + active_2km +
## inactive_2km + elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_wrp + hourly_temp + hourly_hum
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.751e+02 5.365e+02 1.445 0.148540
## month11 -1.138e+01 1.069e+01 -1.065 0.286956
## month12 -6.246e-01 1.157e+01 -0.054 0.956966
## weekdayMon -4.502e+00 5.492e+00 -0.820 0.412317
## weekdayTue -1.019e+01 5.510e+00 -1.850 0.064385 .
## weekdayWed -7.691e-01 5.439e+00 -0.141 0.887550
## weekdayThu 5.550e+00 5.432e+00 1.022 0.306954
## weekdayFri 9.091e+00 5.312e+00 1.711 0.087021 .
## weekdaySat 7.882e+00 5.374e+00 1.467 0.142518
## wd_avg -7.400e-02 1.474e-02 -5.020 5.21e-07 ***
## ws_avg 3.090e+00 7.859e-01 3.932 8.45e-05 ***
## I(1/dist_wrp^2) -1.485e-03 3.691e-04 -4.023 5.76e-05 ***
## I(1/dist_ref^2) -3.664e-03 6.026e-03 -0.608 0.543217
## I(1/dist_dc^2) 1.315e+01 3.620e+00 3.632 0.000282 ***
## monthly_oil_2km -3.991e-02 1.803e-02 -2.213 0.026905 *
## monthly_gas_2km -6.194e-02 4.771e-02 -1.298 0.194215
## active_2km -2.612e+01 4.175e+00 -6.257 3.97e-10 ***
## inactive_2km 1.629e+02 1.456e+01 11.184 < 2e-16 ***
## elevation -8.281e+01 3.893e+00 -21.274 < 2e-16 ***
## EVI -1.337e+03 9.018e+01 -14.826 < 2e-16 ***
## num_odor_complaints -2.810e+00 1.361e-01 -20.642 < 2e-16 ***
## closest_wrp_capacity 2.611e+00 9.956e-01 2.623 0.008732 **
## hourly_downwind_wrp 1.212e+01 5.625e+00 2.155 0.031166 *
## hourly_temp -1.725e+00 2.936e-01 -5.876 4.25e-09 ***
## hourly_hum -2.430e-01 8.825e-02 -2.754 0.005899 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.723 8.939 92.02
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 78.248 80.000 68.80
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 111/114
## R-sq.(adj) = 0.223 Deviance explained = 22.6%
## GCV = 62223 Scale est. = 61999 n = 30242
Exclude Disaster
# Exclude disaster
summary(hm_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## active_2km + inactive_2km + elevation + EVI + num_odor_complaints +
## closest_wrp_capacity + hourly_downwind_ref + hourly_downwind_wrp +
## hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.053e+00 7.042e+00 1.143 0.25284
## month02 -3.310e-01 4.340e+00 -0.076 0.93921
## month03 -4.146e-01 7.819e+00 -0.053 0.95771
## month04 -2.932e-01 1.044e+01 -0.028 0.97759
## month05 -2.902e-01 1.219e+01 -0.024 0.98101
## month06 -2.612e-01 1.308e+01 -0.020 0.98406
## month07 -7.371e-02 1.308e+01 -0.006 0.99550
## month08 1.403e-01 1.219e+01 0.012 0.99082
## month09 1.825e-01 1.044e+01 0.017 0.98605
## month10 6.698e-03 7.819e+00 0.001 0.99932
## month11 -5.259e-02 4.340e+00 -0.012 0.99033
## month12 3.365e-02 1.446e-02 2.327 0.01998 *
## weekdayMon 1.364e-01 8.303e-03 16.431 < 2e-16 ***
## weekdayTue 2.178e-01 8.235e-03 26.448 < 2e-16 ***
## weekdayWed 2.374e-01 8.227e-03 28.860 < 2e-16 ***
## weekdayThu 2.182e-01 8.226e-03 26.529 < 2e-16 ***
## weekdayFri 2.048e-01 8.228e-03 24.886 < 2e-16 ***
## weekdaySat 9.257e-02 8.213e-03 11.272 < 2e-16 ***
## wd_avg -3.425e-04 2.610e-05 -13.121 < 2e-16 ***
## ws_avg -4.396e-02 8.727e-04 -50.375 < 2e-16 ***
## I(1/dist_wrp^2) -6.324e-01 2.381e-02 -26.557 < 2e-16 ***
## I(1/dist_ref^2) 5.842e+00 2.200e-01 26.557 < 2e-16 ***
## I(1/dist_dc^2) 4.251e+03 1.601e+02 26.557 < 2e-16 ***
## active_2km 7.428e-03 1.426e-03 5.210 1.89e-07 ***
## inactive_2km -4.958e-02 4.421e-03 -11.214 < 2e-16 ***
## elevation -4.963e-03 1.910e-03 -2.598 0.00937 **
## EVI -1.109e+00 8.058e-02 -13.761 < 2e-16 ***
## num_odor_complaints 1.379e-02 6.403e-04 21.532 < 2e-16 ***
## closest_wrp_capacity -7.233e-03 4.527e-04 -15.976 < 2e-16 ***
## hourly_downwind_ref -6.319e-02 7.897e-03 -8.002 1.23e-15 ***
## hourly_downwind_wrp 4.382e-02 8.722e-03 5.025 5.05e-07 ***
## hourly_temp -3.845e-02 4.771e-04 -80.601 < 2e-16 ***
## hourly_hum -1.090e-02 1.638e-04 -66.542 < 2e-16 ***
## hourly_precip -7.501e-02 1.949e-01 -0.385 0.70030
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) -1.219e-07 3.000 0.0
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.925e+00 8.989 219.8
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.969e+01 80.000 214.2
## p-value
## s(as.numeric(month)) 1
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 129/131
## R-sq.(adj) = 0.148 Deviance explained = 14.9%
## GCV = 1.6319 Scale est. = 1.6313 n = 337596
Everything w Disaster Indicator
# Disaster indicator
summary(hm_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_ref + hourly_downwind_wrp + hourly_temp +
## hourly_hum + disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.265e+02 9.130e+01 -3.577 0.000348 ***
## month02 -4.106e-02 5.926e+01 -0.001 0.999447
## month03 -6.275e-01 1.068e+02 -0.006 0.995310
## month04 -4.214e-01 1.425e+02 -0.003 0.997641
## month05 -8.743e-01 1.665e+02 -0.005 0.995810
## month06 5.939e-01 1.785e+02 0.003 0.997346
## month07 9.327e-01 1.785e+02 0.005 0.995831
## month08 2.005e+00 1.665e+02 0.012 0.990392
## month09 3.129e+00 1.425e+02 0.022 0.982481
## month10 9.828e+00 1.068e+02 0.092 0.926656
## month11 -7.470e+00 5.926e+01 -0.126 0.899701
## month12 -7.052e+00 8.498e-01 -8.298 < 2e-16 ***
## weekdayMon -1.332e+00 4.925e-01 -2.704 0.006846 **
## weekdayTue -1.195e+00 4.886e-01 -2.445 0.014492 *
## weekdayWed -6.115e-01 4.881e-01 -1.253 0.210237
## weekdayThu -1.655e-01 4.879e-01 -0.339 0.734445
## weekdayFri 5.652e-01 4.874e-01 1.160 0.246200
## weekdaySat 4.511e-01 4.872e-01 0.926 0.354507
## wd_avg -1.638e-02 1.523e-03 -10.759 < 2e-16 ***
## ws_avg 3.331e-01 5.199e-02 6.407 1.49e-10 ***
## I(1/dist_wrp^2) 8.692e-06 8.182e-06 1.062 0.288079
## I(1/dist_ref^2) -6.508e-03 8.856e-04 -7.349 2.00e-13 ***
## I(1/dist_dc^2) 5.031e-01 6.269e-02 8.025 1.02e-15 ***
## monthly_oil_2km 6.642e-04 1.872e-04 3.548 0.000388 ***
## monthly_gas_2km 5.620e-06 3.777e-04 0.015 0.988130
## active_2km 5.131e-01 8.860e-02 5.791 7.00e-09 ***
## inactive_2km 5.517e+00 2.490e-01 22.156 < 2e-16 ***
## elevation -2.927e+00 1.127e-01 -25.966 < 2e-16 ***
## EVI -1.163e+02 4.656e+00 -24.981 < 2e-16 ***
## num_odor_complaints 1.270e+00 2.143e-02 59.263 < 2e-16 ***
## closest_wrp_capacity 8.840e-01 3.090e-02 28.605 < 2e-16 ***
## hourly_downwind_ref -4.312e+00 4.602e-01 -9.369 < 2e-16 ***
## hourly_downwind_wrp 2.998e+00 5.164e-01 5.806 6.42e-09 ***
## hourly_temp -4.508e-01 2.725e-02 -16.542 < 2e-16 ***
## hourly_hum -1.120e-01 9.152e-03 -12.233 < 2e-16 ***
## disaster 1.667e+01 1.198e+00 13.919 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) -5.224e-09 2 0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000e+00 9 90.65
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.719e+01 80 82.80
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 130/133
## R-sq.(adj) = 0.0508 Deviance explained = 5.11%
## GCV = 6255 Scale est. = 6253 n = 367838
Everything w.o Disaster Indicator
# Everything
summary(hm_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_hourly_max ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_ref + hourly_downwind_wrp + hourly_temp +
## hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.183e+02 9.133e+01 -3.486 0.000491 ***
## month02 -2.781e-01 5.928e+01 -0.005 0.996257
## month03 -9.187e-01 1.068e+02 -0.009 0.993136
## month04 -7.947e-01 1.426e+02 -0.006 0.995552
## month05 -1.268e+00 1.665e+02 -0.008 0.993926
## month06 3.780e-01 1.786e+02 0.002 0.998311
## month07 1.063e+00 1.786e+02 0.006 0.995251
## month08 2.511e+00 1.665e+02 0.015 0.987968
## month09 3.783e+00 1.426e+02 0.027 0.978831
## month10 1.635e+01 1.068e+02 0.153 0.878285
## month11 -1.235e+00 5.928e+01 -0.021 0.983383
## month12 -1.170e+00 7.385e-01 -1.584 0.113133
## weekdayMon -1.328e+00 4.927e-01 -2.696 0.007023 **
## weekdayTue -1.183e+00 4.888e-01 -2.420 0.015536 *
## weekdayWed -6.132e-01 4.882e-01 -1.256 0.209132
## weekdayThu -1.974e-01 4.881e-01 -0.404 0.685861
## weekdayFri 5.322e-01 4.875e-01 1.092 0.274972
## weekdaySat 4.249e-01 4.873e-01 0.872 0.383216
## wd_avg -1.649e-02 1.525e-03 -10.812 < 2e-16 ***
## ws_avg 3.342e-01 5.246e-02 6.370 1.89e-10 ***
## I(1/dist_wrp^2) 1.135e-05 8.398e-06 1.352 0.176470
## I(1/dist_ref^2) -6.539e-03 8.889e-04 -7.356 1.89e-13 ***
## I(1/dist_dc^2) 4.948e-01 6.276e-02 7.884 3.18e-15 ***
## monthly_oil_2km 7.443e-04 1.879e-04 3.961 7.46e-05 ***
## monthly_gas_2km 2.943e-05 3.845e-04 0.077 0.938982
## active_2km 4.500e-01 8.862e-02 5.078 3.82e-07 ***
## inactive_2km 5.624e+00 2.494e-01 22.547 < 2e-16 ***
## elevation -2.921e+00 1.129e-01 -25.876 < 2e-16 ***
## EVI -1.157e+02 4.659e+00 -24.838 < 2e-16 ***
## num_odor_complaints 1.287e+00 2.140e-02 60.123 < 2e-16 ***
## closest_wrp_capacity 8.927e-01 3.096e-02 28.832 < 2e-16 ***
## hourly_downwind_ref -4.414e+00 4.605e-01 -9.585 < 2e-16 ***
## hourly_downwind_wrp 3.074e+00 5.166e-01 5.951 2.66e-09 ***
## hourly_temp -4.566e-01 2.730e-02 -16.726 < 2e-16 ***
## hourly_hum -1.164e-01 9.231e-03 -12.609 < 2e-16 ***
## hourly_precip -3.859e+00 1.063e+01 -0.363 0.716519
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) -1.072e-09 2 0.00
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000e+00 9 91.60
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.752e+01 80 80.74
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 130/133
## R-sq.(adj) = 0.0503 Deviance explained = 5.06%
## GCV = 6258.3 Scale est. = 6256.3 n = 367838
Since February 2022
# Since feb 2022
summary(log_hm_sincefeb2022_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km +
## elevation + EVI + num_odor_complaints + hourly_downwind_ref +
## hourly_downwind_wrp + hourly_temp + hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.311e+00 3.070e-01 10.782 < 2e-16 ***
## month02 4.794e-02 1.802e-01 0.266 0.790211
## month03 2.318e-01 3.241e-01 0.715 0.474487
## month04 3.997e-01 4.325e-01 0.924 0.355314
## month05 2.830e-01 5.050e-01 0.560 0.575158
## month06 1.408e-01 5.414e-01 0.260 0.794748
## month07 1.918e-03 5.413e-01 0.004 0.997173
## month08 4.657e-03 5.047e-01 0.009 0.992638
## month09 1.255e-01 4.321e-01 0.290 0.771517
## month10 -4.471e-02 3.241e-01 -0.138 0.890269
## month11 -1.357e-01 1.806e-01 -0.751 0.452472
## month12 -2.053e-01 1.400e-02 -14.671 < 2e-16 ***
## weekdayMon 1.154e-01 6.146e-03 18.768 < 2e-16 ***
## weekdayTue 1.813e-01 6.092e-03 29.755 < 2e-16 ***
## weekdayWed 1.742e-01 6.107e-03 28.521 < 2e-16 ***
## weekdayThu 1.290e-01 6.110e-03 21.111 < 2e-16 ***
## weekdayFri 1.431e-01 6.088e-03 23.502 < 2e-16 ***
## weekdaySat 8.866e-02 6.071e-03 14.603 < 2e-16 ***
## wd_avg -4.300e-04 1.953e-05 -22.020 < 2e-16 ***
## ws_avg -6.787e-02 7.607e-04 -89.220 < 2e-16 ***
## I(1/dist_wrp^2) 7.169e-07 7.728e-08 9.277 < 2e-16 ***
## I(1/dist_ref^2) -2.919e-07 2.118e-06 -0.138 0.890390
## I(1/dist_dc^2) -5.394e-04 2.691e-05 -20.042 < 2e-16 ***
## monthly_oil_2km -3.318e-06 2.403e-06 -1.381 0.167333
## monthly_gas_2km -3.821e-05 6.994e-06 -5.463 4.69e-08 ***
## active_2km 2.169e-02 1.925e-03 11.270 < 2e-16 ***
## inactive_2km -1.578e-02 4.027e-03 -3.917 8.96e-05 ***
## elevation -3.947e-02 3.147e-03 -12.543 < 2e-16 ***
## EVI -2.344e+00 7.236e-02 -32.393 < 2e-16 ***
## num_odor_complaints 1.751e-02 7.962e-04 21.997 < 2e-16 ***
## hourly_downwind_ref -6.058e-03 5.971e-03 -1.015 0.310298
## hourly_downwind_wrp 7.632e-02 6.900e-03 11.060 < 2e-16 ***
## hourly_temp -3.136e-02 3.665e-04 -85.567 < 2e-16 ***
## hourly_hum -1.017e-02 1.303e-04 -78.072 < 2e-16 ***
## hourly_precip 3.703e-01 1.113e-01 3.328 0.000876 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 2.618e-11 0 Inf
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000e+00 9 227.2
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.999e+01 80 896.8
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 129/132
## R-sq.(adj) = 0.496 Deviance explained = 49.7%
## GCV = 0.40612 Scale est. = 0.4058 n = 153718
Disaster Only
# Disaster only
summary(log_hm_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_max) ~ s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp",
## "cc")) + month + weekday + wd_avg + ws_avg + I(1/dist_wrp^2) +
## I(1/dist_ref^2) + I(1/dist_dc^2) + monthly_oil_2km + active_2km +
## inactive_2km + elevation + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_ref + hourly_downwind_wrp + hourly_temp +
## hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.300e+01 2.000e+00 -11.498 < 2e-16 ***
## month11 1.587e-01 3.764e-02 4.216 2.50e-05 ***
## month12 -7.281e-02 3.887e-02 -1.873 0.061078 .
## weekdayMon 5.560e-02 1.941e-02 2.864 0.004188 **
## weekdayTue -7.745e-02 1.948e-02 -3.976 7.04e-05 ***
## weekdayWed 1.200e-01 1.923e-02 6.242 4.37e-10 ***
## weekdayThu 1.086e-01 1.940e-02 5.597 2.20e-08 ***
## weekdayFri 3.229e-02 1.879e-02 1.719 0.085638 .
## weekdaySat -7.210e-02 1.900e-02 -3.795 0.000148 ***
## wd_avg -7.589e-04 5.254e-05 -14.443 < 2e-16 ***
## ws_avg -1.126e-01 2.863e-03 -39.351 < 2e-16 ***
## I(1/dist_wrp^2) -4.404e-05 3.692e-06 -11.929 < 2e-16 ***
## I(1/dist_ref^2) -2.540e-04 2.117e-05 -11.999 < 2e-16 ***
## I(1/dist_dc^2) 3.175e-01 2.604e-02 12.191 < 2e-16 ***
## monthly_oil_2km 6.092e-04 4.319e-05 14.105 < 2e-16 ***
## active_2km -4.152e-01 1.346e-02 -30.847 < 2e-16 ***
## inactive_2km 7.570e-01 4.627e-02 16.360 < 2e-16 ***
## elevation -2.215e-01 1.232e-02 -17.975 < 2e-16 ***
## num_odor_complaints 4.079e-03 4.842e-04 8.424 < 2e-16 ***
## closest_wrp_capacity 6.850e-02 3.809e-03 17.985 < 2e-16 ***
## hourly_downwind_ref -1.588e-01 1.574e-02 -10.090 < 2e-16 ***
## hourly_downwind_wrp 2.311e-01 2.005e-02 11.528 < 2e-16 ***
## hourly_temp -3.327e-02 1.039e-03 -32.031 < 2e-16 ***
## hourly_hum -9.715e-03 3.157e-04 -30.775 < 2e-16 ***
## hourly_precip 1.019e+00 2.722e-01 3.744 0.000181 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.00 9 238.98
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.36 80 89.27
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 111/114
## R-sq.(adj) = 0.56 Deviance explained = 56.2%
## GCV = 0.77762 Scale est. = 0.77478 n = 30242
Exclude Disaster
# Exclude disaster
summary(log_hm_excl_dis_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_ref + hourly_downwind_wrp + hourly_temp +
## hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.664e+00 2.381e-01 19.586 < 2e-16 ***
## month02 -3.220e-01 1.211e-01 -2.658 0.007854 **
## month03 -3.962e-01 2.180e-01 -1.817 0.069213 .
## month04 -2.329e-01 2.910e-01 -0.800 0.423594
## month05 -1.459e-01 3.400e-01 -0.429 0.667782
## month06 -1.049e-01 3.646e-01 -0.288 0.773596
## month07 9.324e-02 3.646e-01 0.256 0.798137
## month08 3.447e-01 3.400e-01 1.014 0.310661
## month09 3.688e-01 2.911e-01 1.267 0.205198
## month10 4.764e-02 2.182e-01 0.218 0.827147
## month11 2.540e-02 1.213e-01 0.209 0.834128
## month12 2.105e-02 7.852e-03 2.681 0.007343 **
## weekdayMon 1.195e-01 4.498e-03 26.560 < 2e-16 ***
## weekdayTue 1.722e-01 4.461e-03 38.609 < 2e-16 ***
## weekdayWed 1.947e-01 4.457e-03 43.692 < 2e-16 ***
## weekdayThu 1.762e-01 4.456e-03 39.530 < 2e-16 ***
## weekdayFri 1.641e-01 4.458e-03 36.824 < 2e-16 ***
## weekdaySat 8.189e-02 4.449e-03 18.406 < 2e-16 ***
## wd_avg -6.385e-04 1.414e-05 -45.147 < 2e-16 ***
## ws_avg -4.274e-02 4.729e-04 -90.360 < 2e-16 ***
## I(1/dist_wrp^2) 1.673e-06 1.266e-07 13.216 < 2e-16 ***
## I(1/dist_ref^2) -6.065e-05 5.807e-06 -10.444 < 2e-16 ***
## I(1/dist_dc^2) 6.920e-04 2.075e-04 3.335 0.000852 ***
## monthly_oil_2km -6.836e-06 1.841e-06 -3.714 0.000204 ***
## monthly_gas_2km -5.265e-05 3.598e-06 -14.633 < 2e-16 ***
## active_2km 2.082e-02 8.355e-04 24.920 < 2e-16 ***
## inactive_2km -3.070e-02 2.436e-03 -12.600 < 2e-16 ***
## elevation 8.180e-04 1.025e-03 0.798 0.424883
## EVI -2.091e+00 4.337e-02 -48.211 < 2e-16 ***
## num_odor_complaints 4.475e-03 3.470e-04 12.896 < 2e-16 ***
## closest_wrp_capacity -4.812e-03 2.976e-04 -16.166 < 2e-16 ***
## hourly_downwind_ref -3.486e-02 4.275e-03 -8.155 3.51e-16 ***
## hourly_downwind_wrp 8.342e-02 4.729e-03 17.641 < 2e-16 ***
## hourly_temp -2.969e-02 2.585e-04 -114.847 < 2e-16 ***
## hourly_hum -8.471e-03 8.876e-05 -95.440 < 2e-16 ***
## hourly_precip -6.580e-01 1.056e-01 -6.233 4.59e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 4.462e-10 1 0
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.999e+00 9 1103
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.999e+01 80 1352
## p-value
## s(as.numeric(month)) 0.641
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 130/133
## R-sq.(adj) = 0.41 Deviance explained = 41%
## GCV = 0.47893 Scale est. = 0.47876 n = 337596
Everything w Disaster Indicator
# Disaster indicator
summary(log_hm_dis_ind_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_ref + hourly_downwind_wrp + hourly_temp +
## hourly_hum + hourly_precip + disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.223e+00 1.391e-01 15.976 < 2e-16 ***
## month02 -3.453e-01 2.860e-02 -12.076 < 2e-16 ***
## month03 -4.884e-01 5.069e-02 -9.634 < 2e-16 ***
## month04 -3.897e-01 6.752e-02 -5.772 7.82e-09 ***
## month05 -3.310e-01 7.875e-02 -4.204 2.63e-05 ***
## month06 -2.829e-01 8.434e-02 -3.355 0.000795 ***
## month07 -6.864e-02 8.421e-02 -0.815 0.415043
## month08 2.333e-01 7.851e-02 2.972 0.002961 **
## month09 3.135e-01 6.733e-02 4.655 3.24e-06 ***
## month10 1.756e-01 5.094e-02 3.447 0.000566 ***
## month11 -9.293e-04 2.917e-02 -0.032 0.974588
## month12 -1.378e-01 8.008e-03 -17.210 < 2e-16 ***
## weekdayMon 1.077e-01 4.628e-03 23.262 < 2e-16 ***
## weekdayTue 1.450e-01 4.592e-03 31.580 < 2e-16 ***
## weekdayWed 1.853e-01 4.586e-03 40.400 < 2e-16 ***
## weekdayThu 1.656e-01 4.586e-03 36.119 < 2e-16 ***
## weekdayFri 1.501e-01 4.580e-03 32.779 < 2e-16 ***
## weekdaySat 6.817e-02 4.578e-03 14.891 < 2e-16 ***
## wd_avg -7.101e-04 1.433e-05 -49.557 < 2e-16 ***
## ws_avg -4.462e-02 4.930e-04 -90.502 < 2e-16 ***
## I(1/dist_wrp^2) 7.858e-07 1.094e-07 7.180 6.99e-13 ***
## I(1/dist_ref^2) -7.458e-05 5.143e-06 -14.502 < 2e-16 ***
## I(1/dist_dc^2) 6.322e-03 2.534e-04 24.949 < 2e-16 ***
## monthly_oil_2km -1.097e-05 1.796e-06 -6.106 1.03e-09 ***
## monthly_gas_2km -9.860e-06 3.989e-06 -2.472 0.013439 *
## active_2km 2.362e-02 8.525e-04 27.711 < 2e-16 ***
## inactive_2km 7.026e-03 2.458e-03 2.858 0.004263 **
## elevation -2.521e-02 1.064e-03 -23.692 < 2e-16 ***
## EVI -2.889e+00 4.382e-02 -65.915 < 2e-16 ***
## num_odor_complaints 1.345e-02 2.015e-04 66.764 < 2e-16 ***
## closest_wrp_capacity 1.569e-03 2.938e-04 5.341 9.23e-08 ***
## hourly_downwind_ref -6.705e-02 4.327e-03 -15.497 < 2e-16 ***
## hourly_downwind_wrp 1.039e-01 4.854e-03 21.407 < 2e-16 ***
## hourly_temp -3.271e-02 2.565e-04 -127.507 < 2e-16 ***
## hourly_hum -9.063e-03 8.679e-05 -104.426 < 2e-16 ***
## hourly_precip -8.261e-01 9.986e-02 -8.273 < 2e-16 ***
## disaster 5.243e-01 1.127e-02 46.530 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df
## s(as.numeric(month)) -1.493e-11 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.974e+00 8.999
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 7.992e+01 80.000
## F p-value
## s(as.numeric(month)) Inf <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 474.2 <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 1297.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 131/134
## R-sq.(adj) = 0.412 Deviance explained = 41.3%
## GCV = 0.5523 Scale est. = 0.55212 n = 367838
Everything w.o Disaster Indicator
# Everything
summary(log_hm_full_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log(H2S_hourly_max) ~ s(as.numeric(month), bs = "cc") + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + month + weekday + wd_avg +
## ws_avg + I(1/dist_wrp^2) + I(1/dist_ref^2) + I(1/dist_dc^2) +
## monthly_oil_2km + monthly_gas_2km + active_2km + inactive_2km +
## elevation + EVI + num_odor_complaints + closest_wrp_capacity +
## hourly_downwind_ref + hourly_downwind_wrp + hourly_temp +
## hourly_hum + hourly_precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.285e+00 2.731e-01 8.365 < 2e-16 ***
## month02 -1.936e-01 7.609e-01 -0.254 0.7992
## month03 -1.795e-01 9.584e-01 -0.187 0.8514
## month04 2.064e-02 7.835e-01 0.026 0.9790
## month05 9.684e-02 6.327e-01 0.153 0.8784
## month06 8.011e-02 6.945e-01 0.115 0.9082
## month07 1.716e-01 6.937e-01 0.247 0.8046
## month08 3.306e-01 6.314e-01 0.524 0.6006
## month09 2.860e-01 7.831e-01 0.365 0.7149
## month10 2.670e-01 9.584e-01 0.279 0.7806
## month11 9.732e-02 7.609e-01 0.128 0.8982
## month12 4.738e-02 6.970e-03 6.798 1.07e-11 ***
## weekdayMon 1.077e-01 4.642e-03 23.201 < 2e-16 ***
## weekdayTue 1.453e-01 4.605e-03 31.551 < 2e-16 ***
## weekdayWed 1.852e-01 4.600e-03 40.259 < 2e-16 ***
## weekdayThu 1.644e-01 4.599e-03 35.754 < 2e-16 ***
## weekdayFri 1.490e-01 4.593e-03 32.436 < 2e-16 ***
## weekdaySat 6.734e-02 4.591e-03 14.667 < 2e-16 ***
## wd_avg -7.118e-04 1.437e-05 -49.531 < 2e-16 ***
## ws_avg -4.471e-02 4.944e-04 -90.420 < 2e-16 ***
## I(1/dist_wrp^2) 1.040e-06 1.089e-07 9.546 < 2e-16 ***
## I(1/dist_ref^2) -8.449e-05 5.591e-06 -15.112 < 2e-16 ***
## I(1/dist_dc^2) 6.098e-03 2.898e-04 21.041 < 2e-16 ***
## monthly_oil_2km -8.581e-06 1.801e-06 -4.764 1.90e-06 ***
## monthly_gas_2km -9.137e-06 4.005e-06 -2.281 0.0225 *
## active_2km 2.129e-02 8.545e-04 24.909 < 2e-16 ***
## inactive_2km 1.214e-02 2.470e-03 4.913 8.97e-07 ***
## elevation -2.497e-02 1.067e-03 -23.389 < 2e-16 ***
## EVI -2.869e+00 4.397e-02 -65.257 < 2e-16 ***
## num_odor_complaints 1.399e-02 2.017e-04 69.367 < 2e-16 ***
## closest_wrp_capacity 1.921e-03 2.949e-04 6.516 7.25e-11 ***
## hourly_downwind_ref -7.015e-02 4.339e-03 -16.168 < 2e-16 ***
## hourly_downwind_wrp 1.062e-01 4.868e-03 21.813 < 2e-16 ***
## hourly_temp -3.290e-02 2.572e-04 -127.912 < 2e-16 ***
## hourly_hum -9.221e-03 8.698e-05 -106.018 < 2e-16 ***
## hourly_precip -7.941e-01 1.002e-01 -7.929 2.22e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.354 4 0.004
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.983 9 478.832
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.974 80 1272.465
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 128/133
## R-sq.(adj) = 0.409 Deviance explained = 40.9%
## GCV = 0.55555 Scale est. = 0.55536 n = 367838
adj_r2 <- function(r2, n, p){
# n-p since p here includes intercept
return(1 - (1-r2)*(n - 1)/(n - p))
}
get_bt_adj_r2 <- function(name, response, daterange) {
data <- get_data(response, daterange)
model <- get(paste0(name,'_', daterange, '_gam'))
response_colname <- names(model$model)[1]
response_colname <- str_sub(response_colname, 5, -2)
predictions <- predict(model, newdata = data)
bt_R2 <- R2(data %>% pull(response_colname), exp(predictions))
bt_adj_r2 <- adj_r2(bt_R2, summary(model)$n, summary(model)$np)
return(bt_adj_r2)
}
# comp r2
data <- get_data('log(H2S_daily_avg)', 'dis_ind')
model <- get(paste0('log_da','_', 'dis_ind', '_gam'))
response_colname <- names(model$model)[1]
response_colname <- str_sub(response_colname, 5, -2)
predictions <- predict(model, newdata = data)
bt_R2 <- R2(data %>% pull(response_colname), exp(predictions))
print('R2 computed using R2() function from caret package')
## [1] "R2 computed using R2() function from caret package"
R2(data %>% pull(response_colname) %>% log(), predictions)
## [1] 0.5544955
print('After adjusting for number of predictors')
## [1] "After adjusting for number of predictors"
adj_r2(R2(data %>% pull(response_colname) %>% log(), predictions),
summary(model)$n, summary(model)$np)
## [1] 0.5508375
print('BT-R2 computed using R2() function from caret package')
## [1] "BT-R2 computed using R2() function from caret package"
R2(data %>% pull(response_colname), exp(predictions))
## [1] 0.0004193897
print('After adjusting for number of predictors')
## [1] "After adjusting for number of predictors"
adj_r2(R2(data %>% pull(response_colname), exp(predictions)),
summary(model)$n, summary(model)$np)
## [1] -0.007788197
print('Returned by get_bt_adj_r2')
## [1] "Returned by get_bt_adj_r2"
get_bt_adj_r2('log_da', 'log(H2S_daily_avg)', 'dis_ind')
## [1] -0.007788197
print('Fit obs(y) ~ exp(predicted y)')
## [1] "Fit obs(y) ~ exp(predicted y)"
model_r2 <- lm(data %>% pull(response_colname) ~ exp(predictions))
summary(model_r2)$r.sq
## [1] 0.0004193897
summary(model_r2)$df[1] + summary(model_r2)$df[2]
## [1] 15595
summary(model_r2)$df[1]
## [1] 2
summary(model_r2)$adj.r.squared
## [1] 0.0003552853
adj_r2(summary(model_r2)$r.sq,
summary(model_r2)$df[1] + summary(model_r2)$df[2],
summary(model_r2)$df[1])
## [1] 0.0003552853
date_names <- c('Since Feb 2022', 'Disaster Only', 'Exclude Disaster',
'Everything w. Disaster Indicator',
'Everything w.o Disaster Indicator')
response_disp_names <- c('Daily Avg', 'Log Daily Avg', 'Daily Max', 'Log Daily Max',
'Hourly Avg', 'Log Hourly Avg', 'Hourly Max', 'Log Hourly Max')
gam_result_table <- expand.grid(date_names, response_disp_names) %>%
setNames(c('date_names', 'response_disp_names'))
date_name_conversion <- tibble(date_names = date_names,
daterange = dateranges)
response_name_conversion <- tibble(response_disp_names = unique(gam_result_table$response_disp_names),
response_obj_name = c(daily_responses, hourly_responses),
model_response_name = response_names,
transformation = rep(c('', 'Log'), 4))
gam_result_table <- gam_result_table %>%
left_join(date_name_conversion) %>%
left_join(response_name_conversion)
## Joining with `by = join_by(date_names)`
## Joining with `by = join_by(response_disp_names)`
gam_result_table <- gam_result_table %>%
mutate(adjr2 = NA,
bt_adjr2 = NA,
n = NA,
p = NA)
for (i in 1:nrow(gam_result_table)) {
name <- gam_result_table$model_response_name[i]
response <- gam_result_table$response_obj_name[i]
daterange <- gam_result_table$daterange[i]
model <- get(paste0(name,'_', daterange, '_gam'))
if (str_detect(response, 'log\\(')) {
bt_adjr2 <- get_bt_adj_r2(name, response, daterange)
} else {
bt_adjr2 <- NA
}
adjr2 <- summary(model)$r.sq
n <- summary(model)$n
p <- summary(model)$np
new_columns <- tibble(adjr2 = adjr2, bt_adjr2 = bt_adjr2, n = n, p = p)
new_row <- bind_cols(gam_result_table[i, 1:6], new_columns)
gam_result_table[i, ] <- new_row
print(str_glue('Completed {i} iterations'))
}
## Completed 1 iterations
## Completed 2 iterations
## Completed 3 iterations
## Completed 4 iterations
## Completed 5 iterations
## Completed 6 iterations
## Completed 7 iterations
## Completed 8 iterations
## Completed 9 iterations
## Completed 10 iterations
## Completed 11 iterations
## Completed 12 iterations
## Completed 13 iterations
## Completed 14 iterations
## Completed 15 iterations
## Completed 16 iterations
## Completed 17 iterations
## Completed 18 iterations
## Completed 19 iterations
## Completed 20 iterations
## Completed 21 iterations
## Completed 22 iterations
## Completed 23 iterations
## Completed 24 iterations
## Completed 25 iterations
## Completed 26 iterations
## Completed 27 iterations
## Completed 28 iterations
## Completed 29 iterations
## Completed 30 iterations
## Completed 31 iterations
## Completed 32 iterations
## Completed 33 iterations
## Completed 34 iterations
## Completed 35 iterations
## Completed 36 iterations
## Completed 37 iterations
## Completed 38 iterations
## Completed 39 iterations
## Completed 40 iterations
temp <- rep(rep(response_disp_names[!str_detect(response_disp_names, 'Log')], each =2), 5)
base_table <- gam_result_table %>%
arrange(factor(date_names, levels = .env$date_names)) %>%
mutate(response_base = temp,
`bt_adjr2` = '') %>%
filter(transformation == '') %>%
select(all_of(c('date_names', 'response_base', 'model_response_name', 'adjr2', 'bt_adjr2', 'n', 'p'))) %>%
select(-bt_adjr2)
log_table <- gam_result_table %>%
arrange(factor(date_names, levels = .env$date_names)) %>%
mutate(response_base = temp) %>%
filter(transformation == 'Log') %>%
select(all_of(c('date_names', 'response_base', 'model_response_name','adjr2', 'bt_adjr2', 'n', 'p')))
gam_result_table_fordisp <- base_table %>%
left_join(log_table, join_by(date_names, response_base)) %>%
select(-'date_names', -starts_with('model_response_name')) %>%
setNames(c('Response', c('Adj.R2', 'N', 'P'), c('Adj.R2', 'BT-Adj.R2', 'N', 'P')))
gam_result_table_kable <- gam_result_table_fordisp %>%
knitr::kable(format = 'latex', digits = 2) %>%
pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4,
"Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
add_header_above(c(' ' = 1, 'No Transformation' = 3, 'Log-Transformation' = 4))
writeLines(gam_result_table_kable, '../figures/gam_result_table.tex')
gam_result_table_fordisp %>%
knitr::kable(format = 'html', digits = 2, table.attr = "style='width:100%;'") %>%
pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4,
"Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
add_header_above(c(' ' = 1, 'No Transformation' = 3, 'Log-Transformation' = 4))
| Response | Adj.R2 | N | P | Adj.R2 | BT-Adj.R2 | N | P |
|---|---|---|---|---|---|---|---|
| Since Feb 2022 | |||||||
| Daily Avg | 0.62 | 6531 | 127 | 0.70 | 0.62 | 6531 | 126 |
| Daily Max | 0.21 | 6531 | 126 | 0.54 | 0.22 | 6531 | 125 |
| Hourly Avg | 0.35 | 153718 | 133 | 0.55 | 0.34 | 153718 | 132 |
| Hourly Max | 0.17 | 153718 | 132 | 0.50 | 0.16 | 153718 | 132 |
| Disaster Only | |||||||
| Daily Avg | 0.42 | 1273 | 105 | 0.75 | 0.55 | 1273 | 111 |
| Daily Max | 0.43 | 1273 | 99 | 0.72 | 0.44 | 1273 | 110 |
| Hourly Avg | 0.21 | 30242 | 114 | 0.58 | 0.22 | 30242 | 114 |
| Hourly Max | 0.22 | 30242 | 114 | 0.56 | 0.24 | 30242 | 114 |
| Exclude Disaster | |||||||
| Daily Avg | 0.46 | 14322 | 128 | 0.57 | 0.46 | 14322 | 129 |
| Daily Max | 0.12 | 14322 | 123 | 0.48 | 0.11 | 14322 | 128 |
| Hourly Avg | 0.25 | 337596 | 132 | 0.44 | 0.25 | 337596 | 133 |
| Hourly Max | 0.15 | 337596 | 131 | 0.41 | 0.14 | 337596 | 133 |
| Everything w D.I | |||||||
| Daily Avg | 0.11 | 15595 | 123 | 0.55 | -0.01 | 15595 | 128 |
| Daily Max | 0.12 | 15595 | 123 | 0.50 | -0.01 | 15595 | 128 |
| Hourly Avg | 0.05 | 367838 | 134 | 0.44 | 0.00 | 367838 | 133 |
| Hourly Max | 0.05 | 367838 | 133 | 0.41 | 0.00 | 367838 | 134 |
| Everything w.o D.I | |||||||
| Daily Avg | 0.11 | 15595 | 122 | 0.54 | -0.01 | 15595 | 127 |
| Daily Max | 0.12 | 15595 | 123 | 0.49 | -0.01 | 15595 | 128 |
| Hourly Avg | 0.05 | 367838 | 133 | 0.43 | 0.00 | 367838 | 132 |
| Hourly Max | 0.05 | 367838 | 133 | 0.41 | 0.00 | 367838 | 133 |
validation_result <- tibble(Model = character(),
model_response_name = character(),
daterange = character(),
'Coef' = character(),
'R-Sq' = numeric(),
'Disaster RMSE' = numeric(),
'Normal RMSE' = numeric())
xgb_result <- tibble(Model = character(),
model_response_name = character(),
daterange = character(),
'R-Sq' = numeric(),
'BT R-Sq' = numeric(),
'RMSE' = numeric(),
'BT RMSE' = numeric())
fit.xgb_da_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_sincefeb2022.rds')
getTrainPerf(fit.xgb_da_sincefeb2022)
fit.xgb_da_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.6 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 700
## nfeatures : 41
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 141 700 6 0.1 0.01 0.5 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
names <- c("Longitude" = "mon_utm_x",
"Latitude" = "mon_utm_y",
"Distance to Refinery" = "dist_ref",
"Angle to Refinery" = "angle_ref",
"Active Wells within 2km" = "active_2km",
"Inactive Wells within 2km" = "inactive_2km",
"Monthly Oil Production 2km" = "monthly_oil_2km",
"Monthly Gas Production 2km" = "monthly_gas_2km",
"Distance to WRP" = "dist_wrp",
"WRP Capacity" = "closest_wrp_capacity",
"Angle to WRP" = "angle_wrp",
"Distance to Dominguez Channel" = "dist_dc",
"Average Daily Temperature" = "daily_temp",
"Average Daily Humidity" = "daily_hum",
"Daily Precipitation" = "daily_precip",
"Average Daily Wind Speed" = "ws_avg",
"Average Daily Wind Direction" = "wd_avg",
"Downwind Refinery" = "daily_downwind_ref",
"Downwind WRP" = "daily_downwind_wrp",
"Elevation" = "elevation",
"Enhanced Vegetation Index" = "EVI",
"Number of Daily Odor Complaints" = "num_odor_complaints",
"2020" = "year_2020",
"2021" = "year_2021",
"2022" = "year_2022",
"2023" = "year_2023",
"January" = "month_01",
"February" = "month_02",
"March" = "month_03",
"April" = "month_04",
"May" = "month_05",
"June" = "month_06",
"July" = "month_07",
"August" = "month_08",
"September" = "month_09",
"October" = "month_10",
"November" = "month_11",
"December" = "month_12",
"Monday" = "weekday_Mon",
"Tuesday" = "weekday_Tue",
"Wednesday" = "weekday_Wed",
"Thursday" = "weekday_Thu",
"Friday" = "weekday_Fri",
"Saturday" = "weekday_Sat",
"Sunday" = "weekday_Sun",
"Disaster" = "disaster")
imp<-varImp(fit.xgb_da_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# we use savePredictions = 'final' to store the predictions on the test set at each fold
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_da_sincefeb2022$pred$obs, pred = fit.xgb_da_sincefeb2022$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Since 2022 XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'da',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_da_sincefeb2022$pred$obs ~
fit.xgb_da_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_sincefeb2022$pred$obs ~
fit.xgb_da_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_sincefeb2022$trainingData),
fit.xgb_da_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'da',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_da_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_dis.rds')
getTrainPerf(fit.xgb_da_dis)
fit.xgb_da_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 868.8 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 300
## nfeatures : 31
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 121 300 6 0.1 0.001 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_da_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_da_dis$pred$obs, pred = fit.xgb_da_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
# geom_abline(intercept = 0, slope = 1, color = 'red') +
# geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Disaster Only XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_da_dis$pred$obs, pred = fit.xgb_da_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'da',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_da_dis$pred$obs ~
fit.xgb_da_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_dis$pred$obs ~
fit.xgb_da_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_dis$trainingData),
fit.xgb_da_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'da',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_da_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_da_excl_dis.rds')
getTrainPerf(fit.xgb_da_excl_dis)
fit.xgb_da_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.7 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 700
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 150 700 6 0.1 0.01 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_da_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_da_excl_dis$pred$obs,
pred = fit.xgb_da_excl_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for exclude disaster XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_excl_dis_obs_vs_pred_plot
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'da',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_da_excl_dis$pred$obs ~
fit.xgb_da_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_excl_dis$pred$obs ~
fit.xgb_da_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_excl_dis$trainingData),
fit.xgb_da_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'da',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_da_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_dis_ind.rds')
getTrainPerf(fit.xgb_da_dis_ind)
fit.xgb_da_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.1 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 700
## nfeatures : 44
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 186 700 4 0.3 0.001 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_da_dis_ind$pred$obs,
pred = fit.xgb_da_dis_ind$pred$pred,
disaster = fit.xgb_da_dis_ind$trainingData$disaster[fit.xgb_da_dis_ind$pred$rowIndex])
fold_stat <- fit.xgb_da_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for everything with disaster indicator XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_dis_ind_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'da',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_da_dis_ind$pred$obs ~
fit.xgb_da_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_dis_ind$pred$obs ~
fit.xgb_da_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_dis_ind$trainingData),
fit.xgb_da_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'da',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_da_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_full.rds')
getTrainPerf(fit.xgb_da_full)
fit.xgb_da_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.6 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "5", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 700
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 243 700 5 0.3 0.001 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_da_full$pred$obs,
pred = fit.xgb_da_full$pred$pred,
disaster = if_else(fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_da_full$trainingData[fit.xgb_da_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
fold_stat <- fit.xgb_da_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for everything XGBoost') +
theme_bw()
xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'da',
daterange = 'full',
'Coef' = summary(lm(test_result_data$obs ~
test_result_data$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(test_result_data$obs ~
test_result_data$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_full$trainingData),
fit.xgb_da_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'da',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
ggarrange(xgb_sincefeb2022_obs_vs_pred_plot,
ggarrange(xgb_disaster_obs_vs_pred_plot, xgb_disaster_obs_vs_pred_plot_zoom,
ncol = 2, labels = c("2", "3")),
ggarrange(xgb_everything_obs_vs_pred_plot, xgb_everything_obs_vs_pred_plot_zoom,
ncol = 2, labels = c("4", "5")),
labels = c("1"),
nrow = 3)
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): full precision may
## not have been achieved in 'pnbeta'
## Warning in stats::pf(stat, df1 = df1, df2 = df2, ncp = ncp): convergence failed
## in 'pnbeta'
ggarrange(xgb_excl_dis_obs_vs_pred_plot,
ggarrange(xgb_dis_ind_obs_vs_pred_plot, xgb_dis_ind_obs_vs_pred_plot_zoom,
ncol = 2, labels = c("3", "4")),
labels = c("1"),
nrow = 2)
knitr::kable(validation_result, digits = 3)
| Model | model_response_name | daterange | Coef | R-Sq | Disaster RMSE | Normal RMSE |
|---|---|---|---|---|---|---|
| Since Feb 2022 | da | sincefeb2022 | 1.012 | 0.759 | NA | NA |
| Disaster Only | da | dis | 1.172 | 0.642 | NA | NA |
| Exclude Disaster | da | excl_dis | 1.004 | 0.673 | NA | NA |
| Everything w. Disaster Indicator | da | dis_ind | 1.089 | 0.965 | 17.744 | 0.281 |
| Everything w.o Disaster Indicator | da | full | 1.049 | 0.989 | 9.919 | 0.137 |
fit.xgb_da_log_h2s_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_sincefeb2022.rds')
getTrainPerf(fit.xgb_da_log_h2s_sincefeb2022)
fit.xgb_da_log_h2s_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.7 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 700
## nfeatures : 41
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 123 700 6 0.1 0.001 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_sincefeb2022$pred$obs),
pred = exp(fit.xgb_da_log_h2s_sincefeb2022$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Since Februrary 2022') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_da',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_da_log_h2s_sincefeb2022$pred$obs ~
fit.xgb_da_log_h2s_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_log_h2s_sincefeb2022$pred$obs ~
fit.xgb_da_log_h2s_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_log_h2s_sincefeb2022$trainingData),
fit.xgb_da_log_h2s_sincefeb2022$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_da_log_h2s_sincefeb2022$trainingData),
fit.xgb_da_log_h2s_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_da',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_da_log_h2s_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_dis.rds')
getTrainPerf(fit.xgb_da_log_h2s_dis)
fit.xgb_da_log_h2s_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.6 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "5", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 700
## nfeatures : 31
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 69 700 5 0.1 0.001 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_dis$pred$obs),
pred = exp(fit.xgb_da_log_h2s_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Disaster Only') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_da',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_da_log_h2s_dis$pred$obs ~
fit.xgb_da_log_h2s_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_log_h2s_dis$pred$obs ~
fit.xgb_da_log_h2s_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_log_h2s_dis$trainingData),
fit.xgb_da_log_h2s_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_da_log_h2s_dis$trainingData),
fit.xgb_da_log_h2s_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_da',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_da_log_h2s_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_excl_dis.rds')
getTrainPerf(fit.xgb_da_log_h2s_excl_dis)
fit.xgb_da_log_h2s_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.7 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 700
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 123 700 6 0.1 0.001 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_excl_dis$pred$obs),
pred = exp(fit.xgb_da_log_h2s_excl_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Exclude Disaster') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_da',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_da_log_h2s_excl_dis$pred$obs ~
fit.xgb_da_log_h2s_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_log_h2s_excl_dis$pred$obs ~
fit.xgb_da_log_h2s_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_log_h2s_excl_dis$trainingData),
fit.xgb_da_log_h2s_excl_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_da_log_h2s_excl_dis$trainingData),
fit.xgb_da_log_h2s_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_da',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_da_log_h2s_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_dis_ind.rds')
getTrainPerf(fit.xgb_da_log_h2s_dis_ind)
fit.xgb_da_log_h2s_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 700
## nfeatures : 44
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 285 700 6 0.3 0.001 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_da_log_h2s_dis_ind$pred$obs,
pred = fit.xgb_da_log_h2s_dis_ind$pred$pred,
disaster = fit.xgb_da_log_h2s_dis_ind$trainingData$disaster[fit.xgb_da_log_h2s_dis_ind$pred$rowIndex])
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_dis_ind$pred$obs),
pred = exp(fit.xgb_da_log_h2s_dis_ind$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w. Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_da',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_da_log_h2s_dis_ind$pred$obs ~
fit.xgb_da_log_h2s_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_log_h2s_dis_ind$pred$obs ~
fit.xgb_da_log_h2s_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_log_h2s_dis_ind$trainingData),
fit.xgb_da_log_h2s_dis_ind$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_da_log_h2s_dis_ind$trainingData),
fit.xgb_da_log_h2s_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_da',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_da_log_h2s_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_da_log_h2s_full.rds')
getTrainPerf(fit.xgb_da_log_h2s_full)
fit.xgb_da_log_h2s_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "6", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 700
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 285 700 6 0.3 0.001 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_log_h2s_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_da_log_h2s_full$pred$obs,
pred = fit.xgb_da_log_h2s_full$pred$pred,
disaster = if_else(fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_da_log_h2s_full$trainingData[fit.xgb_da_log_h2s_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_da_log_h2s_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_da_log_h2s_full$pred$obs),
pred = exp(fit.xgb_da_log_h2s_full$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w.o Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_da',
daterange = 'full',
'Coef' = summary(lm(fit.xgb_da_log_h2s_full$pred$obs ~
fit.xgb_da_log_h2s_full$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_da_log_h2s_full$pred$obs ~
fit.xgb_da_log_h2s_full$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_da_log_h2s_full$trainingData),
fit.xgb_da_log_h2s_full$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_da_log_h2s_full$trainingData),
fit.xgb_da_log_h2s_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_da',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_dm_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_sincefeb2022.rds')
getTrainPerf(fit.xgb_dm_sincefeb2022)
fit.xgb_dm_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.75", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 300
## nfeatures : 41
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 151 300 6 0.1 0.01 0.75 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# we use savePredictions = 'final' to store the predictions on the test set at each fold
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_dm_sincefeb2022$pred$obs, pred = fit.xgb_dm_sincefeb2022$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Since 2022 XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'dm',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_dm_sincefeb2022$pred$obs ~
fit.xgb_dm_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_sincefeb2022$pred$obs ~
fit.xgb_dm_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_sincefeb2022$trainingData),
fit.xgb_dm_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'dm',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_dm_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_dis.rds')
getTrainPerf(fit.xgb_dm_dis)
fit.xgb_dm_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 459.6 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "4", gamma = "0.01", colsample_bytree = "0.75", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 300
## nfeatures : 31
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 43 300 4 0.1 0.01 0.75 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_dm_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_dm_dis$pred$obs, pred = fit.xgb_dm_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
# geom_abline(intercept = 0, slope = 1, color = 'red') +
# geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Disaster Only XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_dm_dis$pred$obs, pred = fit.xgb_dm_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'dm',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_dm_dis$pred$obs ~
fit.xgb_dm_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_dis$pred$obs ~
fit.xgb_dm_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_dis$trainingData),
fit.xgb_dm_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'dm',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_dm_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_excl_dis.rds')
getTrainPerf(fit.xgb_dm_excl_dis)
fit.xgb_dm_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.1 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 300
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 142 300 6 0.1 0.01 0.5 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_dm_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_dm_excl_dis$pred$obs,
pred = fit.xgb_dm_excl_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for exclude disaster XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_excl_dis_obs_vs_pred_plot
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'dm',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_dm_excl_dis$pred$obs ~
fit.xgb_dm_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_excl_dis$pred$obs ~
fit.xgb_dm_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_excl_dis$trainingData),
fit.xgb_dm_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'dm',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_dm_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_dis_ind.rds')
getTrainPerf(fit.xgb_dm_dis_ind)
fit.xgb_dm_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.7 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "6", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 700
## nfeatures : 44
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 294 700 6 0.3 0.001 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_dm_dis_ind$pred$obs,
pred = fit.xgb_dm_dis_ind$pred$pred,
disaster = fit.xgb_dm_dis_ind$trainingData$disaster[fit.xgb_dm_dis_ind$pred$rowIndex])
fold_stat <- fit.xgb_dm_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for everything with disaster indicator XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_dis_ind_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'dm',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_dm_dis_ind$pred$obs ~
fit.xgb_dm_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_dis_ind$pred$obs ~
fit.xgb_dm_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_dis_ind$trainingData),
fit.xgb_dm_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'dm',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_dm_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_full.rds')
getTrainPerf(fit.xgb_dm_full)
fit.xgb_dm_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.6 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "5", gamma = "0.001", colsample_bytree = "0.75", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 700
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 231 700 5 0.3 0.001 0.75 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_dm_full$pred$obs,
pred = fit.xgb_dm_full$pred$pred,
disaster = if_else(fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_dm_full$trainingData[fit.xgb_dm_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
fold_stat <- fit.xgb_dm_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for everything XGBoost') +
theme_bw()
xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'dm',
daterange = 'full',
'Coef' = summary(lm(test_result_data$obs ~
test_result_data$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(test_result_data$obs ~
test_result_data$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
train_adj_r2 <- adj_r2(getTrainPerf(fit.xgb_dm_full)$TrainRsquared,
nrow(fit.xgb_dm_full$trainingData),
fit.xgb_dm_full$finalModel$nfeatures)
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_full$trainingData),
fit.xgb_dm_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'dm',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_dm_log_h2s_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_sincefeb2022.rds')
getTrainPerf(fit.xgb_dm_log_h2s_sincefeb2022)
fit.xgb_dm_log_h2s_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 300
## nfeatures : 41
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 133 300 6 0.1 0.001 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_sincefeb2022$pred$obs),
pred = exp(fit.xgb_dm_log_h2s_sincefeb2022$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Since Februrary 2022') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_dm',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_dm_log_h2s_sincefeb2022$pred$obs ~
fit.xgb_dm_log_h2s_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_sincefeb2022$pred$obs ~
fit.xgb_dm_log_h2s_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_log_h2s_sincefeb2022$trainingData),
fit.xgb_dm_log_h2s_sincefeb2022$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_dm_log_h2s_sincefeb2022$trainingData),
fit.xgb_dm_log_h2s_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_dm',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_dm_log_h2s_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_dis.rds')
getTrainPerf(fit.xgb_dm_log_h2s_dis)
fit.xgb_dm_log_h2s_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 793.6 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "4", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 500
## nfeatures : 31
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 32 500 4 0.1 0.01 0.5 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_dis$pred$obs),
pred = exp(fit.xgb_dm_log_h2s_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Disaster Only') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_dm',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_dm_log_h2s_dis$pred$obs ~
fit.xgb_dm_log_h2s_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_dis$pred$obs ~
fit.xgb_dm_log_h2s_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_log_h2s_dis$trainingData),
fit.xgb_dm_log_h2s_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_dm_log_h2s_dis$trainingData),
fit.xgb_dm_log_h2s_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_dm',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_dm_log_h2s_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_excl_dis.rds')
getTrainPerf(fit.xgb_dm_log_h2s_excl_dis)
fit.xgb_dm_log_h2s_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "6", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 500
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 158 500 6 0.1 0.01 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_excl_dis$pred$obs),
pred = exp(fit.xgb_dm_log_h2s_excl_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Exclude Disaster') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_dm',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_dm_log_h2s_excl_dis$pred$obs ~
fit.xgb_dm_log_h2s_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_excl_dis$pred$obs ~
fit.xgb_dm_log_h2s_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_log_h2s_excl_dis$trainingData),
fit.xgb_dm_log_h2s_excl_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_dm_log_h2s_excl_dis$trainingData),
fit.xgb_dm_log_h2s_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_dm',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_dm_log_h2s_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_dis_ind.rds')
getTrainPerf(fit.xgb_dm_log_h2s_dis_ind)
fit.xgb_dm_log_h2s_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "6", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 700
## nfeatures : 44
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 321 700 6 0.3 0.01 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_dm_log_h2s_dis_ind$pred$obs,
pred = fit.xgb_dm_log_h2s_dis_ind$pred$pred,
disaster = fit.xgb_dm_log_h2s_dis_ind$trainingData$disaster[fit.xgb_dm_log_h2s_dis_ind$pred$rowIndex])
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_dis_ind$pred$obs),
pred = exp(fit.xgb_dm_log_h2s_dis_ind$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w. Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_dm',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_dm_log_h2s_dis_ind$pred$obs ~
fit.xgb_dm_log_h2s_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_dis_ind$pred$obs ~
fit.xgb_dm_log_h2s_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_log_h2s_dis_ind$trainingData),
fit.xgb_dm_log_h2s_dis_ind$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_dm_log_h2s_dis_ind$trainingData),
fit.xgb_dm_log_h2s_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_dm',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_dm_log_h2s_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_dm_log_h2s_full.rds')
getTrainPerf(fit.xgb_dm_log_h2s_full)
fit.xgb_dm_log_h2s_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 2.8 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "6", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 700
## nfeatures : 43
## xNames : wd_avg ws_avg daily_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km daily_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity daily_temp daily_hum daily_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 321 700 6 0.3 0.01 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_dm_log_h2s_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_dm_log_h2s_full$pred$obs,
pred = fit.xgb_dm_log_h2s_full$pred$pred,
disaster = if_else(fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_dm_log_h2s_full$trainingData[fit.xgb_dm_log_h2s_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_dm_log_h2s_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_dm_log_h2s_full$pred$obs),
pred = exp(fit.xgb_dm_log_h2s_full$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w.o Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_dm',
daterange = 'full',
'Coef' = summary(lm(fit.xgb_dm_log_h2s_full$pred$obs ~
fit.xgb_dm_log_h2s_full$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_dm_log_h2s_full$pred$obs ~
fit.xgb_dm_log_h2s_full$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_dm_log_h2s_full$trainingData),
fit.xgb_dm_log_h2s_full$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_dm_log_h2s_full$trainingData),
fit.xgb_dm_log_h2s_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_dm',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_ha_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_sincefeb2022.rds')
getTrainPerf(fit.xgb_ha_sincefeb2022)
fit.xgb_ha_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 251.7 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 150
## nfeatures : 41
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54 150 4 0.4 0.001 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
names <- c("Longitude" = "mon_utm_x",
"Latitude" = "mon_utm_y",
"Distance to Refinery" = "dist_ref",
"Angle to Refinery" = "angle_ref",
"Active Wells within 2km" = "active_2km",
"Inactive Wells within 2km" = "inactive_2km",
"Monthly Oil Production 2km" = "monthly_oil_2km",
"Monthly Gas Production 2km" = "monthly_gas_2km",
"Distance to WRP" = "dist_wrp",
"WRP Capacity" = "closest_wrp_capacity",
"Angle to WRP" = "angle_wrp",
"Distance to Dominguez Channel" = "dist_dc",
"Hourly Temperature" = "hourly_temp",
"Hourly Humidity" = "hourly_hum",
"Hourly Precipitation" = "hourly_precip",
"Hourly Wind Speed" = "ws_avg",
"Hourly Wind Direction" = "wd_avg",
"Downwind Refinery" = "hourly_downwind_ref",
"Downwind WRP" = "hourly_downwind_wrp",
"Elevation" = "elevation",
"Enhanced Vegetation Index" = "EVI",
"Number of Daily Odor Complaints" = "num_odor_complaints",
"2020" = "year_2020",
"2021" = "year_2021",
"2022" = "year_2022",
"2023" = "year_2023",
"January" = "month_01",
"February" = "month_02",
"March" = "month_03",
"April" = "month_04",
"May" = "month_05",
"June" = "month_06",
"July" = "month_07",
"August" = "month_08",
"September" = "month_09",
"October" = "month_10",
"November" = "month_11",
"December" = "month_12",
"Monday" = "weekday_Mon",
"Tuesday" = "weekday_Tue",
"Wednesday" = "weekday_Wed",
"Thursday" = "weekday_Thu",
"Friday" = "weekday_Fri",
"Saturday" = "weekday_Sat",
"Sunday" = "weekday_Sun",
"Disaster" = "disaster")
imp<-varImp(fit.xgb_ha_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# we use savePredictions = 'final' to store the predictions on the test set at each fold
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_ha_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_ha_sincefeb2022$pred$obs, pred = fit.xgb_ha_sincefeb2022$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Since 2022 XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'ha',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_ha_sincefeb2022$pred$obs ~
fit.xgb_ha_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_ha_sincefeb2022$pred$obs ~
fit.xgb_ha_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_ha_sincefeb2022$trainingData),
fit.xgb_ha_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'ha',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_ha_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_dis.rds')
getTrainPerf(fit.xgb_ha_dis)
fit.xgb_ha_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 84.9 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "0.8", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 50
## nfeatures : 31
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 51 50 4 0.4 0.001 0.8 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_ha_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_ha_dis$pred$obs, pred = fit.xgb_ha_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
# geom_abline(intercept = 0, slope = 1, color = 'red') +
# geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Disaster Only XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_ha_dis$pred$obs, pred = fit.xgb_ha_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'ha',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_ha_dis$pred$obs ~
fit.xgb_ha_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_ha_dis$pred$obs ~
fit.xgb_ha_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_ha_dis$trainingData),
fit.xgb_ha_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'ha',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_ha_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_excl_dis.rds')
getTrainPerf(fit.xgb_ha_excl_dis)
fit.xgb_ha_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.1 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "0.8", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 52 150 4 0.4 0.001 0.8 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_ha_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_ha_excl_dis$pred$obs,
pred = fit.xgb_ha_excl_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for exclude disaster XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_excl_dis_obs_vs_pred_plot
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'ha',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_ha_excl_dis$pred$obs ~
fit.xgb_ha_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_ha_excl_dis$pred$obs ~
fit.xgb_ha_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_ha_excl_dis$trainingData),
fit.xgb_ha_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'ha',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_ha_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_dis_ind.rds')
getTrainPerf(fit.xgb_ha_dis_ind)
fit.xgb_ha_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249.3 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 150
## nfeatures : 44
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 62 150 4 0.4 0.01 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_ha_dis_ind$pred$obs,
pred = fit.xgb_ha_dis_ind$pred$pred,
disaster = fit.xgb_ha_dis_ind$trainingData$disaster[fit.xgb_ha_dis_ind$pred$rowIndex])
fold_stat <- fit.xgb_ha_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for everything with disaster indicator XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_dis_ind_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'ha',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_ha_dis_ind$pred$obs ~
fit.xgb_ha_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_ha_dis_ind$pred$obs ~
fit.xgb_ha_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_ha_dis_ind$trainingData),
fit.xgb_ha_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'ha',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_ha_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_ha_full.rds')
getTrainPerf(fit.xgb_ha_full)
fit.xgb_ha_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54 150 4 0.4 0.001 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_ha_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_ha_full$pred$obs,
pred = fit.xgb_ha_full$pred$pred,
disaster = if_else(fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_ha_full$trainingData[fit.xgb_ha_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
fold_stat <- fit.xgb_ha_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for everything XGBoost') +
theme_bw()
xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'ha',
daterange = 'full',
'Coef' = summary(lm(test_result_data$obs ~
test_result_data$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(test_result_data$obs ~
test_result_data$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_ha_full$trainingData),
fit.xgb_ha_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'ha',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_log_ha_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_sincefeb2022.rds')
getTrainPerf(fit.xgb_log_ha_sincefeb2022)
fit.xgb_log_ha_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 252.8 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 150
## nfeatures : 41
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56 150 4 0.4 0.001 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_sincefeb2022$pred$obs),
pred = exp(fit.xgb_log_ha_sincefeb2022$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Since Februrary 2022') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_ha',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_log_ha_sincefeb2022$pred$obs ~
fit.xgb_log_ha_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_ha_sincefeb2022$pred$obs ~
fit.xgb_log_ha_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_ha_sincefeb2022$trainingData),
fit.xgb_log_ha_sincefeb2022$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_ha_sincefeb2022$trainingData),
fit.xgb_log_ha_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_ha',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_ha_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_dis.rds')
getTrainPerf(fit.xgb_log_ha_dis)
fit.xgb_log_ha_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 250.3 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 150
## nfeatures : 31
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56 150 4 0.4 0.001 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_dis$pred$obs),
pred = exp(fit.xgb_log_ha_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Disaster Only') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_ha',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_log_ha_dis$pred$obs ~
fit.xgb_log_ha_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_ha_dis$pred$obs ~
fit.xgb_log_ha_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_ha_dis$trainingData),
fit.xgb_log_ha_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_ha_dis$trainingData),
fit.xgb_log_ha_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_ha',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_ha_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_excl_dis.rds')
getTrainPerf(fit.xgb_log_ha_excl_dis)
fit.xgb_log_ha_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.9 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64 150 4 0.4 0.01 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_excl_dis$pred$obs),
pred = exp(fit.xgb_log_ha_excl_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Exclude Disaster') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_ha',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_log_ha_excl_dis$pred$obs ~
fit.xgb_log_ha_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_ha_excl_dis$pred$obs ~
fit.xgb_log_ha_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_ha_excl_dis$trainingData),
fit.xgb_log_ha_excl_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_ha_excl_dis$trainingData),
fit.xgb_log_ha_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_ha',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_ha_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_dis_ind.rds')
getTrainPerf(fit.xgb_log_ha_dis_ind)
fit.xgb_log_ha_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 254 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 150
## nfeatures : 44
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64 150 4 0.4 0.01 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_log_ha_dis_ind$pred$obs,
pred = fit.xgb_log_ha_dis_ind$pred$pred,
disaster = fit.xgb_log_ha_dis_ind$trainingData$disaster[fit.xgb_log_ha_dis_ind$pred$rowIndex])
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_dis_ind$pred$obs),
pred = exp(fit.xgb_log_ha_dis_ind$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w. Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_ha',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_log_ha_dis_ind$pred$obs ~
fit.xgb_log_ha_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_ha_dis_ind$pred$obs ~
fit.xgb_log_ha_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_ha_dis_ind$trainingData),
fit.xgb_log_ha_dis_ind$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_ha_dis_ind$trainingData),
fit.xgb_log_ha_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_ha',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_ha_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_ha_full.rds')
getTrainPerf(fit.xgb_log_ha_full)
fit.xgb_log_ha_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.9 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 62 150 4 0.4 0.01 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_ha_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_log_ha_full$pred$obs,
pred = fit.xgb_log_ha_full$pred$pred,
disaster = if_else(fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_log_ha_full$trainingData[fit.xgb_log_ha_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_ha_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_ha_full$pred$obs),
pred = exp(fit.xgb_log_ha_full$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w.o Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_ha',
daterange = 'full',
'Coef' = summary(lm(fit.xgb_log_ha_full$pred$obs ~
fit.xgb_log_ha_full$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_ha_full$pred$obs ~
fit.xgb_log_ha_full$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_ha_full$trainingData),
fit.xgb_log_ha_full$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_ha_full$trainingData),
fit.xgb_log_ha_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_ha',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_hm_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_sincefeb2022.rds')
getTrainPerf(fit.xgb_hm_sincefeb2022)
fit.xgb_hm_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 251.8 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.2", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 150
## nfeatures : 41
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 30 150 4 0.2 0.01 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# we use savePredictions = 'final' to store the predictions on the test set at each fold
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_hm_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_hm_sincefeb2022$pred$obs, pred = fit.xgb_hm_sincefeb2022$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Since 2022 XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'hm',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_hm_sincefeb2022$pred$obs ~
fit.xgb_hm_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_hm_sincefeb2022$pred$obs ~
fit.xgb_hm_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_hm_sincefeb2022$trainingData),
fit.xgb_hm_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'hm',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_hm_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_dis.rds')
getTrainPerf(fit.xgb_hm_dis)
fit.xgb_hm_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 86.2 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.2", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 50
## nfeatures : 31
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 29 50 4 0.2 0.01 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_hm_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_hm_dis$pred$obs, pred = fit.xgb_hm_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
# geom_abline(intercept = 0, slope = 1, color = 'red') +
# geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Disaster Only XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = fit.xgb_hm_dis$pred$obs, pred = fit.xgb_hm_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'hm',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_hm_dis$pred$obs ~
fit.xgb_hm_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_hm_dis$pred$obs ~
fit.xgb_hm_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_hm_dis$trainingData),
fit.xgb_hm_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'hm',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_hm_excl_dis<- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_excl_dis.rds')
getTrainPerf(fit.xgb_hm_excl_dis)
fit.xgb_hm_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 252.7 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "0.8", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 60 150 4 0.4 0.01 0.8 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
fold_stat <- fit.xgb_hm_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = fit.xgb_hm_excl_dis$pred$obs,
pred = fit.xgb_hm_excl_dis$pred$pred),
aes(x = pred, y = obs)) +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for exclude disaster XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_excl_dis_obs_vs_pred_plot
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'hm',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_hm_excl_dis$pred$obs ~
fit.xgb_hm_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_hm_excl_dis$pred$obs ~
fit.xgb_hm_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_hm_excl_dis$trainingData),
fit.xgb_hm_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'hm',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_hm_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_dis_ind.rds')
getTrainPerf(fit.xgb_hm_dis_ind)
fit.xgb_hm_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249.9 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 150
## nfeatures : 44
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54 150 4 0.4 0.001 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_hm_dis_ind$pred$obs,
pred = fit.xgb_hm_dis_ind$pred$pred,
disaster = fit.xgb_hm_dis_ind$trainingData$disaster[fit.xgb_hm_dis_ind$pred$rowIndex])
fold_stat <- fit.xgb_hm_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for everything with disaster indicator XGBoost') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
theme_bw()
xgb_dis_ind_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = test_result_data$obs,
pred = test_result_data$pred,
disaster = test_result_data$disaster),
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'hm',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_hm_dis_ind$pred$obs ~
fit.xgb_hm_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_hm_dis_ind$pred$obs ~
fit.xgb_hm_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_hm_dis_ind$trainingData),
fit.xgb_hm_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'hm',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_hm_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_hm_full.rds')
getTrainPerf(fit.xgb_hm_full)
fit.xgb_hm_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 248.7 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 54 150 4 0.4 0.001 1 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_hm_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_hm_full$pred$obs,
pred = fit.xgb_hm_full$pred$pred,
disaster = if_else(fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_hm_full$trainingData[fit.xgb_hm_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
fold_stat <- fit.xgb_hm_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs))
test_r2 <- mean(fold_stat$R2)
test_rmse <- mean(fold_stat$RMSE)
xgb_everything_obs_vs_pred_plot <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for everything XGBoost') +
theme_bw()
xgb_everything_obs_vs_pred_plot_zoom <- ggplot(test_result_data,
aes(x = pred, y = obs)) +
geom_point(aes(col = factor(disaster)), show.legend = FALSE) +
stat_poly_line() +
stat_poly_eq(use_label(c("eq", "R2", "n"))) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'hm',
daterange = 'full',
'Coef' = summary(lm(test_result_data$obs ~
test_result_data$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(test_result_data$obs ~
test_result_data$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_hm_full$trainingData),
fit.xgb_hm_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'hm',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = NA,
'RMSE' = test_rmse,
'BT RMSE' = NA))
fit.xgb_log_hm_sincefeb2022 <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_sincefeb2022.rds')
getTrainPerf(fit.xgb_log_hm_sincefeb2022)
fit.xgb_log_hm_sincefeb2022$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 252.8 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 41
## niter: 150
## nfeatures : 41
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56 150 4 0.4 0.001 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_sincefeb2022,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_sincefeb2022$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_sincefeb2022_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_sincefeb2022$pred$obs),
pred = exp(fit.xgb_log_hm_sincefeb2022$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Since Februrary 2022') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.05) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.15) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_hm',
daterange = 'sincefeb2022',
'Coef' = summary(lm(fit.xgb_log_hm_sincefeb2022$pred$obs ~
fit.xgb_log_hm_sincefeb2022$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_hm_sincefeb2022$pred$obs ~
fit.xgb_log_hm_sincefeb2022$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_hm_sincefeb2022$trainingData),
fit.xgb_log_hm_sincefeb2022$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_hm_sincefeb2022$trainingData),
fit.xgb_log_hm_sincefeb2022$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
model_response_name = 'log_hm',
daterange = 'sincefeb2022',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_hm_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_dis.rds')
getTrainPerf(fit.xgb_log_hm_dis)
fit.xgb_log_hm_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 249.9 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 150
## nfeatures : 31
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_10 month_11 month_12 year_2021 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64 150 4 0.4 0.01 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_dis$pred$obs),
pred = exp(fit.xgb_log_hm_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Disaster Only') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.18) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_hm',
daterange = 'dis',
'Coef' = summary(lm(fit.xgb_log_hm_dis$pred$obs ~
fit.xgb_log_hm_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_hm_dis$pred$obs ~
fit.xgb_log_hm_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_hm_dis$trainingData),
fit.xgb_log_hm_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_hm_dis$trainingData),
fit.xgb_log_hm_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
model_response_name = 'log_hm',
daterange = 'dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_hm_excl_dis <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_excl_dis.rds')
getTrainPerf(fit.xgb_log_hm_excl_dis)
fit.xgb_log_hm_excl_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.8 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64 150 4 0.4 0.01 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_excl_dis,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_excl_dis$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_excl_dis_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_excl_dis$pred$obs),
pred = exp(fit.xgb_log_hm_excl_dis$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Exclude Disaster') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) + stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) + stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_hm',
daterange = 'excl_dis',
'Coef' = summary(lm(fit.xgb_log_hm_excl_dis$pred$obs ~
fit.xgb_log_hm_excl_dis$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_hm_excl_dis$pred$obs ~
fit.xgb_log_hm_excl_dis$pred$pred))$r.squared,
'Disaster RMSE' = NA,
'Normal RMSE' = NA))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_hm_excl_dis$trainingData),
fit.xgb_log_hm_excl_dis$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_hm_excl_dis$trainingData),
fit.xgb_log_hm_excl_dis$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Exclude Disaster',
model_response_name = 'log_hm',
daterange = 'excl_dis',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_hm_dis_ind <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_dis_ind.rds')
getTrainPerf(fit.xgb_log_hm_dis_ind)
fit.xgb_log_hm_dis_ind$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 253.3 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.001", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 44
## niter: 150
## nfeatures : 44
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip disaster month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 56 150 4 0.4 0.001 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_dis_ind,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_log_hm_dis_ind$pred$obs,
pred = fit.xgb_log_hm_dis_ind$pred$pred,
disaster = fit.xgb_log_hm_dis_ind$trainingData$disaster[fit.xgb_log_hm_dis_ind$pred$rowIndex])
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_dis_ind$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_dis_ind_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_dis_ind$pred$obs),
pred = exp(fit.xgb_log_hm_dis_ind$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w. Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.03) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.08) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.17) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_hm',
daterange = 'dis_ind',
'Coef' = summary(lm(fit.xgb_log_hm_dis_ind$pred$obs ~
fit.xgb_log_hm_dis_ind$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_hm_dis_ind$pred$obs ~
fit.xgb_log_hm_dis_ind$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_hm_dis_ind$trainingData),
fit.xgb_log_hm_dis_ind$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_hm_dis_ind$trainingData),
fit.xgb_log_hm_dis_ind$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w. Disaster Indicator',
model_response_name = 'log_hm',
daterange = 'dis_ind',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
fit.xgb_log_hm_full <- readRDS('../rfiles/xgboost_v2/fit.xgb_log_hm_full.rds')
getTrainPerf(fit.xgb_log_hm_full)
fit.xgb_log_hm_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 254 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.4", max_depth = "4", gamma = "0.01", colsample_bytree = "1", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 43
## niter: 150
## nfeatures : 43
## xNames : wd_avg ws_avg hourly_downwind_ref dist_wrp dist_ref mon_utm_x mon_utm_y monthly_oil_2km monthly_gas_2km active_2km inactive_2km hourly_downwind_wrp elevation EVI num_odor_complaints dist_dc closest_wrp_capacity hourly_temp hourly_hum hourly_precip month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 year_2020 year_2021 year_2022 year_2023 weekday_Sun weekday_Mon weekday_Tue weekday_Wed weekday_Thu weekday_Fri weekday_Sat
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 64 150 4 0.4 0.01 1 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_log_hm_full,scale=FALSE)
# rename variables
imp <- tibble(variable = rownames(imp$importance), importance = imp$importance$Overall) %>%
pivot_wider(names_from = variable,
values_from = importance) %>%
rename(any_of(names)) %>%
pivot_longer(cols = everything(),names_to = 'variable', values_to = 'importance')
imp %>%
top_n(15, importance) %>%
ggplot(aes(x=reorder(variable, importance), y=importance)) +
geom_point() +
geom_segment(aes(x=variable,xend=variable,y=0,yend=importance)) +
ylab("importance") +
xlab("Variable") +
coord_flip() +
theme_minimal()
test_result_data <- tibble(obs = fit.xgb_log_hm_full$pred$obs,
pred = fit.xgb_log_hm_full$pred$pred,
disaster = if_else(fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$year_2021 == 1 &
(fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$month_10 == 1 |
fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$month_11 == 1 |
fit.xgb_log_hm_full$trainingData[fit.xgb_log_hm_full$pred$rowIndex, ]$month_12 == 1), 1, 0))
# Here, we compute the R2 and RMSE for each fold and take the average
fold_stat <- fit.xgb_log_hm_full$pred %>% group_by(Resample) %>%
summarise(R2 = R2(pred, obs), RMSE = RMSE(pred, obs),
R2_BT = R2(exp(pred), exp(obs)), RMSE_BT = RMSE(exp(pred), exp(obs)))
test_r2 <- mean(fold_stat$R2)
test_r2_bt <- mean(fold_stat$R2_BT)
test_rmse <- mean(fold_stat$RMSE)
test_rmse_bt <- mean(fold_stat$RMSE_BT)
log_h2s_xgb_full_obs_vs_pred_plot <- ggplot(tibble(obs = exp(fit.xgb_log_hm_full$pred$obs),
pred = exp(fit.xgb_log_hm_full$pred$pred)),
aes(x = pred, y = obs)) +
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point() +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w.o Disaster Indicator') +
stat_poly_line() +
stat_poly_eq(use_label(c("eq")), label.x = "right", label.y = 0.15) +
stat_poly_eq(use_label(c("R2")), label.x = "right", label.y = 0.1) +
stat_poly_eq(use_label(c("n")), label.x = "right", label.y = 0.05) +
theme_bw()
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_hm',
daterange = 'full',
'Coef' = summary(lm(fit.xgb_log_hm_full$pred$obs ~
fit.xgb_log_hm_full$pred$pred))$coefficients[2, 1],
'R-Sq' = summary(lm(fit.xgb_log_hm_full$pred$obs ~
fit.xgb_log_hm_full$pred$pred))$r.squared,
'Disaster RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 1)],
test_result_data$obs[which(test_result_data$disaster == 1)]),
'Normal RMSE' = RMSE(test_result_data$pred[which(test_result_data$disaster == 0)],
test_result_data$obs[which(test_result_data$disaster == 0)])))
test_adj_r2 <- adj_r2(test_r2,
nrow(fit.xgb_log_hm_full$trainingData),
fit.xgb_log_hm_full$finalModel$nfeatures)
BT_adj_r2 <- adj_r2(test_r2_bt,
nrow(fit.xgb_log_hm_full$trainingData),
fit.xgb_log_hm_full$finalModel$nfeatures)
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
model_response_name = 'log_hm',
daterange = 'full',
'R-Sq' = test_adj_r2,
'BT R-Sq' = BT_adj_r2,
'RMSE' = test_rmse,
'BT RMSE' = test_rmse_bt))
validation_result
xgb_result
base_validation_result <- validation_result %>%
mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
filter(transformation == '') %>%
mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
str_detect(model_response_name, 'dm') ~ 'Daily Max',
str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
str_detect(model_response_name, 'hm') ~ 'Hourly Max')) %>%
arrange(factor(Model, levels = unique(validation_result$Model)))
log_validation_result <- validation_result %>%
mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
filter(transformation == 'Log') %>%
mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
str_detect(model_response_name, 'dm') ~ 'Daily Max',
str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
str_detect(model_response_name, 'hm') ~ 'Hourly Max'))
base_xgb_result <- xgb_result %>%
mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
filter(transformation == '') %>%
mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
str_detect(model_response_name, 'dm') ~ 'Daily Max',
str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
str_detect(model_response_name, 'hm') ~ 'Hourly Max')) %>%
select(-transformation, -`BT R-Sq`, -`BT RMSE`)
log_xgb_result <- xgb_result %>%
mutate(transformation = if_else(str_detect(model_response_name, 'log_'), 'Log', '')) %>%
filter(transformation == 'Log') %>%
mutate(response_base = case_when(str_detect(model_response_name, 'da') ~ 'Daily Avg',
str_detect(model_response_name, 'dm') ~ 'Daily Max',
str_detect(model_response_name, 'ha') ~ 'Hourly Avg',
str_detect(model_response_name, 'hm') ~ 'Hourly Max')) %>%
select(-transformation)
# This is the result from regressing the observed on predicted
options(knitr.kable.NA = '')
validation_result_table <- base_validation_result %>%
left_join(log_validation_result, join_by(Model, response_base)) %>%
select(all_of(c('response_base', 'Coef.x', 'R-Sq.x', 'Normal RMSE.x',
'Disaster RMSE.x', 'Coef.y', 'R-Sq.y', 'Normal RMSE.y',
'Disaster RMSE.y'))) %>%
setNames(c('Response', 'Coef', 'R2', 'Normal RMSE', 'Disaster RMSE',
'Coef', 'R2', 'Normal RMSE', 'Disaster RMSE'))
validation_result_table %>%
knitr::kable(format = 'pipe', digits = 3, table.attr = "style='width:100%;'") %>%
pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4,
"Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
add_header_above(c(' ' = 1, 'No Transformation' = 4, 'Log-Transformation' = 4)) %>%
kable_styling()
| Response | Coef | R2 | Normal RMSE | Disaster RMSE | Coef | R2 | Normal RMSE | Disaster RMSE |
|---|---|---|---|---|---|---|---|---|
| Since Feb 2022 | ||||||||
| Daily Avg | 1.012 | 0.759 | 1.019 | 0.814 | ||||
| Daily Max | 0.743 | 0.207 | 0.992 | 0.614 | ||||
| Hourly Avg | 1.011 | 0.533 | 1.028 | 0.698 | ||||
| Hourly Max | 1.033 | 0.365 | 1.025 | 0.638 | ||||
| Disaster Only | ||||||||
| Daily Avg | 1.172 | 0.642 | 1.015 | 0.864 | ||||
| Daily Max | 1.041 | 0.532 | 1.020 | 0.821 | ||||
| Hourly Avg | 0.925 | 0.434 | 1.002 | 0.771 | ||||
| Hourly Max | 0.894 | 0.485 | 1.000 | 0.750 | ||||
| Exclude Disaster | ||||||||
| Daily Avg | 1.004 | 0.673 | 1.025 | 0.792 | ||||
| Daily Max | 0.793 | 0.137 | 0.984 | 0.608 | ||||
| Hourly Avg | 1.033 | 0.403 | 1.047 | 0.653 | ||||
| Hourly Max | 0.965 | 0.276 | 1.043 | 0.604 | ||||
| Everything w D.I | ||||||||
| Daily Avg | 1.089 | 0.965 | 0.281 | 17.744 | 1.009 | 0.977 | 0.121 | 0.135 |
| Daily Max | 0.997 | 0.998 | 1.047 | 27.285 | 1.008 | 0.961 | 0.202 | 0.205 |
| Hourly Avg | 1.013 | 0.940 | 1.686 | 32.007 | 1.043 | 0.680 | 0.496 | 0.669 |
| Hourly Max | 1.042 | 0.915 | 4.313 | 81.977 | 1.043 | 0.642 | 0.567 | 0.719 |
| Everything w.o D.I | ||||||||
| Daily Avg | 1.049 | 0.989 | 0.137 | 9.919 | 1.011 | 0.975 | 0.121 | 0.198 |
| Daily Max | 0.990 | 0.994 | 5.095 | 50.451 | 1.010 | 0.958 | 0.212 | 0.197 |
| Hourly Avg | 1.030 | 0.958 | 2.041 | 26.420 | 1.045 | 0.679 | 0.496 | 0.674 |
| Hourly Max | 0.948 | 0.874 | 4.301 | 100.490 | 1.046 | 0.641 | 0.567 | 0.728 |
full_result_table_fordisp <- base_table %>%
left_join(log_table %>% select(-n), join_by(date_names, response_base)) %>%
left_join(base_xgb_result, join_by(date_names == Model, response_base)) %>%
left_join(log_xgb_result, join_by(date_names == Model, response_base)) %>%
select(-starts_with('model_response_name'), -starts_with('daterange'), -'date_names') %>%
select(all_of(c('response_base', 'n', 'adjr2.x', 'p.x', 'adjr2.y', 'bt_adjr2', 'p.y', 'R-Sq.x', 'RMSE.x', 'R-Sq.y', 'RMSE.y', 'BT R-Sq', 'BT RMSE'))) %>%
setNames(c('Response','N', 'R2', 'P', 'R2', 'BT R2', 'P',
c('R2', 'RMSE'),
c('R2', 'RMSE', 'BT R2', 'BT RMSE')))
full_result_table_kable <- full_result_table_fordisp %>%
knitr::kable(format = 'latex', digits = 2) %>%
pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4,
"Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
add_header_above(c(' ' = 7, 'Test' = 2, 'Test' = 4)) %>%
add_header_above(c(' ' = 2, 'No Transformation' = 2, 'Log-Transformation' = 3, 'No Transformation' = 2, 'Log-Transformation' = 4)) %>%
add_header_above(c(' ' = 2, 'GAM' = 5, 'XGBoost' = 6))
writeLines(gam_result_table_kable, '../figures/gam_result_table.tex')
full_result_table_fordisp %>%
knitr::kable(format = 'pipe', digits = 4, table.attr = "style='width:100%;'") %>%
pack_rows(index = c("Since Feb 2022" = 4, "Disaster Only" = 4,
"Exclude Disaster" = 4, "Everything w D.I" = 4, "Everything w.o D.I" = 4)) %>%
add_header_above(c(' ' = 7, 'Test' = 2, 'Test' = 4)) %>%
add_header_above(c(' ' = 2, 'No Transformation' = 2, 'Log-Transformation' = 3, 'No Transformation' = 2, 'Log-Transformation' = 4)) %>%
add_header_above(c(' ' = 2, 'GAM' = 5, 'XGBoost' = 6)) %>%
kable_styling()
| Response | N | R2 | P | R2 | BT R2 | P | R2 | RMSE | R2 | RMSE | BT R2 | BT RMSE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Since Feb 2022 | ||||||||||||
| Daily Avg | 6531 | 0.6187 | 127 | 0.7001 | 0.6174 | 126 | 0.7614 | 0.2478 | 0.8127 | 0.3059 | 0.7637 | 0.2478 |
| Daily Max | 6531 | 0.2086 | 126 | 0.5438 | 0.2177 | 125 | 0.2879 | 3.1244 | 0.6123 | 0.5722 | 0.3557 | 2.9330 |
| Hourly Avg | 153718 | 0.3521 | 133 | 0.5480 | 0.3414 | 132 | 0.5345 | 0.4892 | 0.6975 | 0.4493 | 0.5042 | 0.5114 |
| Hourly Max | 153718 | 0.1748 | 132 | 0.4962 | 0.1611 | 132 | 0.3769 | 1.0293 | 0.6383 | 0.5400 | 0.2907 | 1.1047 |
| Disaster Only | ||||||||||||
| Daily Avg | 1273 | 0.4170 | 105 | 0.7458 | 0.5459 | 111 | 0.5257 | 40.7997 | 0.8532 | 0.4274 | 0.8037 | 30.9226 |
| Daily Max | 1273 | 0.4320 | 99 | 0.7246 | 0.4399 | 110 | 0.7240 | 360.8105 | 0.8146 | 0.6128 | 0.7604 | 296.5671 |
| Hourly Avg | 30242 | 0.2097 | 114 | 0.5830 | 0.2170 | 114 | 0.4512 | 96.7972 | 0.7705 | 0.6075 | 0.4515 | 97.9899 |
| Hourly Max | 30242 | 0.2229 | 114 | 0.5601 | 0.2360 | 114 | 0.5473 | 196.2375 | 0.7492 | 0.6631 | 0.4520 | 213.3409 |
| Exclude Disaster | ||||||||||||
| Daily Avg | 14322 | 0.4568 | 128 | 0.5671 | 0.4575 | 129 | 0.7043 | 0.3188 | 0.7915 | 0.3326 | 0.6951 | 0.3258 |
| Daily Max | 14322 | 0.1230 | 123 | 0.4810 | 0.1071 | 128 | 0.2722 | 3.4251 | 0.6073 | 0.5725 | 0.3417 | 3.2564 |
| Hourly Avg | 337596 | 0.2458 | 132 | 0.4425 | 0.2478 | 133 | 0.4896 | 0.6074 | 0.6526 | 0.4940 | 0.4561 | 0.6354 |
| Hourly Max | 337596 | 0.1484 | 131 | 0.4097 | 0.1404 | 133 | 0.3360 | 1.1044 | 0.6043 | 0.5672 | 0.2918 | 1.1416 |
| Everything w D.I | ||||||||||||
| Daily Avg | 15595 | 0.1136 | 123 | 0.5511 | -0.0078 | 128 | 0.9816 | 2.5865 | 0.9775 | 0.1219 | 0.9812 | 1.6211 |
| Daily Max | 15595 | 0.1235 | 123 | 0.5032 | -0.0079 | 128 | 0.9829 | 5.5020 | 0.9609 | 0.2016 | 0.9980 | 4.3671 |
| Hourly Avg | 367838 | 0.0488 | 134 | 0.4381 | 0.0016 | 133 | 0.9322 | 8.6180 | 0.6801 | 0.5121 | 0.4724 | 29.6837 |
| Hourly Max | 367838 | 0.0508 | 133 | 0.4124 | 0.0006 | 134 | 0.9223 | 21.4359 | 0.6415 | 0.5812 | 0.4736 | 62.7187 |
| Everything w.o D.I | ||||||||||||
| Daily Avg | 15595 | 0.1123 | 122 | 0.5436 | -0.0078 | 127 | 0.9968 | 1.1273 | 0.9747 | 0.1289 | 0.9073 | 4.5043 |
| Daily Max | 15595 | 0.1223 | 123 | 0.4937 | -0.0079 | 128 | 0.9797 | 8.7597 | 0.9574 | 0.2098 | 0.9966 | 7.8280 |
| Hourly Avg | 367838 | 0.0483 | 133 | 0.4348 | 0.0012 | 132 | 0.9491 | 7.5139 | 0.6790 | 0.5131 | 0.4320 | 29.1288 |
| Hourly Max | 367838 | 0.0503 | 133 | 0.4090 | 0.0004 | 133 | 0.8837 | 25.5470 | 0.6408 | 0.5819 | 0.4550 | 63.2933 |